Plumb the HAXM-based hardware acceleration support
[qemu/ar7.git] / exec.c
blob 34fa7b0708d112bcafdeb71d96b58ea5913ad511
1 /*
2 * Virtual page mapping
4 * Copyright (c) 2003 Fabrice Bellard
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include "qemu/osdep.h"
20 #include "qapi/error.h"
21 #ifndef _WIN32
22 #endif
24 #include "qemu/cutils.h"
25 #include "cpu.h"
26 #include "exec/exec-all.h"
27 #include "tcg.h"
28 #include "hw/qdev-core.h"
29 #if !defined(CONFIG_USER_ONLY)
30 #include "hw/boards.h"
31 #include "hw/xen/xen.h"
32 #endif
33 #include "sysemu/kvm.h"
34 #include "sysemu/hax.h"
35 #include "sysemu/sysemu.h"
36 #include "qemu/timer.h"
37 #include "qemu/config-file.h"
38 #include "qemu/error-report.h"
39 #if defined(CONFIG_USER_ONLY)
40 #include "qemu.h"
41 #else /* !CONFIG_USER_ONLY */
42 #include "hw/hw.h"
43 #include "exec/memory.h"
44 #include "exec/ioport.h"
45 #include "sysemu/dma.h"
46 #include "exec/address-spaces.h"
47 #include "sysemu/xen-mapcache.h"
48 #include "trace.h"
49 #endif
50 #include "exec/cpu-all.h"
51 #include "qemu/rcu_queue.h"
52 #include "qemu/main-loop.h"
53 #include "translate-all.h"
54 #include "sysemu/replay.h"
56 #include "exec/memory-internal.h"
57 #include "exec/ram_addr.h"
58 #include "exec/log.h"
60 #include "migration/vmstate.h"
62 #include "qemu/range.h"
63 #ifndef _WIN32
64 #include "qemu/mmap-alloc.h"
65 #endif
67 //#define DEBUG_SUBPAGE
69 #if !defined(CONFIG_USER_ONLY)
70 /* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
71 * are protected by the ramlist lock.
73 RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
75 static MemoryRegion *system_memory;
76 static MemoryRegion *system_io;
78 AddressSpace address_space_io;
79 AddressSpace address_space_memory;
81 MemoryRegion io_mem_rom, io_mem_notdirty;
82 static MemoryRegion io_mem_unassigned;
84 /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
85 #define RAM_PREALLOC (1 << 0)
87 /* RAM is mmap-ed with MAP_SHARED */
88 #define RAM_SHARED (1 << 1)
90 /* Only a portion of RAM (used_length) is actually used, and migrated.
91 * This used_length size can change across reboots.
93 #define RAM_RESIZEABLE (1 << 2)
95 #endif
97 #ifdef TARGET_PAGE_BITS_VARY
98 int target_page_bits;
99 bool target_page_bits_decided;
100 #endif
102 struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
103 /* current CPU in the current thread. It is only valid inside
104 cpu_exec() */
105 __thread CPUState *current_cpu;
106 /* 0 = Do not count executed instructions.
107 1 = Precise instruction counting.
108 2 = Adaptive rate instruction counting. */
109 int use_icount;
111 bool set_preferred_target_page_bits(int bits)
113 /* The target page size is the lowest common denominator for all
114 * the CPUs in the system, so we can only make it smaller, never
115 * larger. And we can't make it smaller once we've committed to
116 * a particular size.
118 #ifdef TARGET_PAGE_BITS_VARY
119 assert(bits >= TARGET_PAGE_BITS_MIN);
120 if (target_page_bits == 0 || target_page_bits > bits) {
121 if (target_page_bits_decided) {
122 return false;
124 target_page_bits = bits;
126 #endif
127 return true;
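/*
 * Illustrative sketch only (not part of this file): target code that knows
 * its minimum page size might negotiate it while realizing the CPU, e.g.
 * with a hypothetical 4 KiB request:
 *
 *     if (!set_preferred_target_page_bits(12)) {
 *         error_setg(errp, "cannot use 4 KiB pages: target page size "
 *                    "has already been finalized");
 *         return;
 *     }
 */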
130 #if !defined(CONFIG_USER_ONLY)
132 static void finalize_target_page_bits(void)
134 #ifdef TARGET_PAGE_BITS_VARY
135 if (target_page_bits == 0) {
136 target_page_bits = TARGET_PAGE_BITS_MIN;
138 target_page_bits_decided = true;
139 #endif
142 typedef struct PhysPageEntry PhysPageEntry;
144 struct PhysPageEntry {
 145 /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
146 uint32_t skip : 6;
147 /* index into phys_sections (!skip) or phys_map_nodes (skip) */
148 uint32_t ptr : 26;
151 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
153 /* Size of the L2 (and L3, etc) page tables. */
154 #define ADDR_SPACE_BITS 64
156 #define P_L2_BITS 9
157 #define P_L2_SIZE (1 << P_L2_BITS)
159 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
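/*
 * Worked example (illustrative): with ADDR_SPACE_BITS = 64, P_L2_BITS = 9 and
 * a typical TARGET_PAGE_BITS of 12, the page index is 64 - 12 = 52 bits wide
 * and P_L2_LEVELS = ((64 - 12 - 1) / 9) + 1 = 6, i.e. six radix-tree levels
 * of 512 entries each (6 * 9 = 54 bits >= 52 bits).
 */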
161 typedef PhysPageEntry Node[P_L2_SIZE];
163 typedef struct PhysPageMap {
164 struct rcu_head rcu;
166 unsigned sections_nb;
167 unsigned sections_nb_alloc;
168 unsigned nodes_nb;
169 unsigned nodes_nb_alloc;
170 Node *nodes;
171 MemoryRegionSection *sections;
172 } PhysPageMap;
174 struct AddressSpaceDispatch {
175 struct rcu_head rcu;
177 MemoryRegionSection *mru_section;
178 /* This is a multi-level map on the physical address space.
179 * The bottom level has pointers to MemoryRegionSections.
181 PhysPageEntry phys_map;
182 PhysPageMap map;
183 AddressSpace *as;
186 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
187 typedef struct subpage_t {
188 MemoryRegion iomem;
189 AddressSpace *as;
190 hwaddr base;
191 uint16_t sub_section[];
192 } subpage_t;
194 #define PHYS_SECTION_UNASSIGNED 0
195 #define PHYS_SECTION_NOTDIRTY 1
196 #define PHYS_SECTION_ROM 2
197 #define PHYS_SECTION_WATCH 3
199 static void io_mem_init(void);
200 static void memory_map_init(void);
201 static void tcg_commit(MemoryListener *listener);
203 static MemoryRegion io_mem_watch;
206 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
207 * @cpu: the CPU whose AddressSpace this is
208 * @as: the AddressSpace itself
209 * @memory_dispatch: its dispatch pointer (cached, RCU protected)
210 * @tcg_as_listener: listener for tracking changes to the AddressSpace
212 struct CPUAddressSpace {
213 CPUState *cpu;
214 AddressSpace *as;
215 struct AddressSpaceDispatch *memory_dispatch;
216 MemoryListener tcg_as_listener;
219 #endif
221 #if !defined(CONFIG_USER_ONLY)
223 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
225 static unsigned alloc_hint = 16;
226 if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
227 map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, alloc_hint);
228 map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
229 map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
230 alloc_hint = map->nodes_nb_alloc;
234 static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
236 unsigned i;
237 uint32_t ret;
238 PhysPageEntry e;
239 PhysPageEntry *p;
241 ret = map->nodes_nb++;
242 p = map->nodes[ret];
243 assert(ret != PHYS_MAP_NODE_NIL);
244 assert(ret != map->nodes_nb_alloc);
246 e.skip = leaf ? 0 : 1;
247 e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
248 for (i = 0; i < P_L2_SIZE; ++i) {
249 memcpy(&p[i], &e, sizeof(e));
251 return ret;
254 static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
255 hwaddr *index, hwaddr *nb, uint16_t leaf,
256 int level)
258 PhysPageEntry *p;
259 hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
261 if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
262 lp->ptr = phys_map_node_alloc(map, level == 0);
264 p = map->nodes[lp->ptr];
265 lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
267 while (*nb && lp < &p[P_L2_SIZE]) {
268 if ((*index & (step - 1)) == 0 && *nb >= step) {
269 lp->skip = 0;
270 lp->ptr = leaf;
271 *index += step;
272 *nb -= step;
273 } else {
274 phys_page_set_level(map, lp, index, nb, leaf, level - 1);
276 ++lp;
280 static void phys_page_set(AddressSpaceDispatch *d,
281 hwaddr index, hwaddr nb,
282 uint16_t leaf)
284 /* Wildly overreserve - it doesn't matter much. */
285 phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
287 phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 290 /* Compact a non-leaf page entry. Simply detect that the entry has a single child,
291 * and update our entry so we can skip it and go directly to the destination.
293 static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
295 unsigned valid_ptr = P_L2_SIZE;
296 int valid = 0;
297 PhysPageEntry *p;
298 int i;
300 if (lp->ptr == PHYS_MAP_NODE_NIL) {
301 return;
304 p = nodes[lp->ptr];
305 for (i = 0; i < P_L2_SIZE; i++) {
306 if (p[i].ptr == PHYS_MAP_NODE_NIL) {
307 continue;
310 valid_ptr = i;
311 valid++;
312 if (p[i].skip) {
313 phys_page_compact(&p[i], nodes);
317 /* We can only compress if there's only one child. */
318 if (valid != 1) {
319 return;
322 assert(valid_ptr < P_L2_SIZE);
324 /* Don't compress if it won't fit in the # of bits we have. */
325 if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
326 return;
329 lp->ptr = p[valid_ptr].ptr;
330 if (!p[valid_ptr].skip) {
331 /* If our only child is a leaf, make this a leaf. */
332 /* By design, we should have made this node a leaf to begin with so we
333 * should never reach here.
334 * But since it's so simple to handle this, let's do it just in case we
335 * change this rule.
337 lp->skip = 0;
338 } else {
339 lp->skip += p[valid_ptr].skip;
343 static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
345 if (d->phys_map.skip) {
346 phys_page_compact(&d->phys_map, d->map.nodes);
350 static inline bool section_covers_addr(const MemoryRegionSection *section,
351 hwaddr addr)
353 /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
354 * the section must cover the entire address space.
356 return int128_gethi(section->size) ||
357 range_covers_byte(section->offset_within_address_space,
358 int128_getlo(section->size), addr);
361 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
362 Node *nodes, MemoryRegionSection *sections)
364 PhysPageEntry *p;
365 hwaddr index = addr >> TARGET_PAGE_BITS;
366 int i;
368 for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
369 if (lp.ptr == PHYS_MAP_NODE_NIL) {
370 return &sections[PHYS_SECTION_UNASSIGNED];
372 p = nodes[lp.ptr];
373 lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
376 if (section_covers_addr(&sections[lp.ptr], addr)) {
377 return &sections[lp.ptr];
378 } else {
379 return &sections[PHYS_SECTION_UNASSIGNED];
383 bool memory_region_is_unassigned(MemoryRegion *mr)
385 return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
386 && mr != &io_mem_watch;
389 /* Called from RCU critical section */
390 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
391 hwaddr addr,
392 bool resolve_subpage)
394 MemoryRegionSection *section = atomic_read(&d->mru_section);
395 subpage_t *subpage;
396 bool update;
398 if (section && section != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
399 section_covers_addr(section, addr)) {
400 update = false;
401 } else {
402 section = phys_page_find(d->phys_map, addr, d->map.nodes,
403 d->map.sections);
404 update = true;
406 if (resolve_subpage && section->mr->subpage) {
407 subpage = container_of(section->mr, subpage_t, iomem);
408 section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
410 if (update) {
411 atomic_set(&d->mru_section, section);
413 return section;
416 /* Called from RCU critical section */
417 static MemoryRegionSection *
418 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
419 hwaddr *plen, bool resolve_subpage)
421 MemoryRegionSection *section;
422 MemoryRegion *mr;
423 Int128 diff;
425 section = address_space_lookup_region(d, addr, resolve_subpage);
426 /* Compute offset within MemoryRegionSection */
427 addr -= section->offset_within_address_space;
429 /* Compute offset within MemoryRegion */
430 *xlat = addr + section->offset_within_region;
432 mr = section->mr;
434 /* MMIO registers can be expected to perform full-width accesses based only
435 * on their address, without considering adjacent registers that could
436 * decode to completely different MemoryRegions. When such registers
437 * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
438 * regions overlap wildly. For this reason we cannot clamp the accesses
439 * here.
441 * If the length is small (as is the case for address_space_ldl/stl),
442 * everything works fine. If the incoming length is large, however,
443 * the caller really has to do the clamping through memory_access_size.
445 if (memory_region_is_ram(mr)) {
446 diff = int128_sub(section->size, int128_make64(addr));
447 *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
449 return section;
452 /* Called from RCU critical section */
453 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
454 hwaddr *xlat, hwaddr *plen,
455 bool is_write)
457 IOMMUTLBEntry iotlb;
458 MemoryRegionSection *section;
459 MemoryRegion *mr;
461 for (;;) {
462 AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
463 section = address_space_translate_internal(d, addr, &addr, plen, true);
464 mr = section->mr;
466 if (!mr->iommu_ops) {
467 break;
470 iotlb = mr->iommu_ops->translate(mr, addr, is_write);
471 addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
472 | (addr & iotlb.addr_mask));
473 *plen = MIN(*plen, (addr | iotlb.addr_mask) - addr + 1);
474 if (!(iotlb.perm & (1 << is_write))) {
475 mr = &io_mem_unassigned;
476 break;
479 as = iotlb.target_as;
482 if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
483 hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
484 *plen = MIN(page, *plen);
487 *xlat = addr;
488 return mr;
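/*
 * Illustrative sketch only (not part of this file): callers must remain in an
 * RCU critical section for as long as they use the returned MemoryRegion,
 * roughly like this (where `gpa` is a hypothetical guest physical address):
 *
 *     hwaddr xlat, len = 4;
 *     MemoryRegion *mr;
 *
 *     rcu_read_lock();
 *     mr = address_space_translate(&address_space_memory, gpa, &xlat, &len,
 *                                  false);
 *     if (memory_access_is_direct(mr, false)) {
 *         // RAM-backed: the data lives at memory_region_get_ram_ptr(mr) + xlat
 *     }
 *     rcu_read_unlock();
 */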
491 /* Called from RCU critical section */
492 MemoryRegionSection *
493 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
494 hwaddr *xlat, hwaddr *plen)
496 MemoryRegionSection *section;
497 AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
499 section = address_space_translate_internal(d, addr, xlat, plen, false);
501 assert(!section->mr->iommu_ops);
502 return section;
504 #endif
506 #if !defined(CONFIG_USER_ONLY)
508 static int cpu_common_post_load(void *opaque, int version_id)
510 CPUState *cpu = opaque;
512 /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
513 version_id is increased. */
514 cpu->interrupt_request &= ~0x01;
515 tlb_flush(cpu, 1);
517 return 0;
520 static int cpu_common_pre_load(void *opaque)
522 CPUState *cpu = opaque;
524 cpu->exception_index = -1;
526 return 0;
529 static bool cpu_common_exception_index_needed(void *opaque)
531 CPUState *cpu = opaque;
533 return tcg_enabled() && cpu->exception_index != -1;
536 static const VMStateDescription vmstate_cpu_common_exception_index = {
537 .name = "cpu_common/exception_index",
538 .version_id = 1,
539 .minimum_version_id = 1,
540 .needed = cpu_common_exception_index_needed,
541 .fields = (VMStateField[]) {
542 VMSTATE_INT32(exception_index, CPUState),
543 VMSTATE_END_OF_LIST()
547 static bool cpu_common_crash_occurred_needed(void *opaque)
549 CPUState *cpu = opaque;
551 return cpu->crash_occurred;
554 static const VMStateDescription vmstate_cpu_common_crash_occurred = {
555 .name = "cpu_common/crash_occurred",
556 .version_id = 1,
557 .minimum_version_id = 1,
558 .needed = cpu_common_crash_occurred_needed,
559 .fields = (VMStateField[]) {
560 VMSTATE_BOOL(crash_occurred, CPUState),
561 VMSTATE_END_OF_LIST()
565 const VMStateDescription vmstate_cpu_common = {
566 .name = "cpu_common",
567 .version_id = 1,
568 .minimum_version_id = 1,
569 .pre_load = cpu_common_pre_load,
570 .post_load = cpu_common_post_load,
571 .fields = (VMStateField[]) {
572 VMSTATE_UINT32(halted, CPUState),
573 VMSTATE_UINT32(interrupt_request, CPUState),
574 VMSTATE_END_OF_LIST()
576 .subsections = (const VMStateDescription*[]) {
577 &vmstate_cpu_common_exception_index,
578 &vmstate_cpu_common_crash_occurred,
579 NULL
583 #endif
585 CPUState *qemu_get_cpu(int index)
587 CPUState *cpu;
589 CPU_FOREACH(cpu) {
590 if (cpu->cpu_index == index) {
591 return cpu;
595 return NULL;
598 #if !defined(CONFIG_USER_ONLY)
599 void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
601 CPUAddressSpace *newas;
603 /* Target code should have set num_ases before calling us */
604 assert(asidx < cpu->num_ases);
606 if (asidx == 0) {
607 /* address space 0 gets the convenience alias */
608 cpu->as = as;
611 /* KVM cannot currently support multiple address spaces. */
612 assert(asidx == 0 || !kvm_enabled());
614 if (!cpu->cpu_ases) {
615 cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
618 newas = &cpu->cpu_ases[asidx];
619 newas->cpu = cpu;
620 newas->as = as;
621 if (tcg_enabled()) {
622 newas->tcg_as_listener.commit = tcg_commit;
623 memory_listener_register(&newas->tcg_as_listener, as);
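/*
 * Illustrative sketch only (not part of this file): a target with two address
 * spaces (say, hypothetical secure and non-secure views) would wire them up
 * during CPU realize roughly as:
 *
 *     cpu->num_ases = 2;
 *     cpu_address_space_init(cpu, as_nonsecure, 0);
 *     cpu_address_space_init(cpu, as_secure, 1);
 *
 * where as_nonsecure/as_secure are AddressSpaces created by the target code;
 * note the asserts above: num_ases must be set first, and only index 0 is
 * allowed when KVM is enabled.
 */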
627 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
629 /* Return the AddressSpace corresponding to the specified index */
630 return cpu->cpu_ases[asidx].as;
632 #endif
634 void cpu_exec_unrealizefn(CPUState *cpu)
636 CPUClass *cc = CPU_GET_CLASS(cpu);
638 cpu_list_remove(cpu);
640 if (cc->vmsd != NULL) {
641 vmstate_unregister(NULL, cc->vmsd, cpu);
643 if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
644 vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
648 void cpu_exec_initfn(CPUState *cpu)
650 #ifdef TARGET_WORDS_BIGENDIAN
651 cpu->bigendian = true;
652 #else
653 cpu->bigendian = false;
654 #endif
655 cpu->as = NULL;
656 cpu->num_ases = 0;
658 #ifndef CONFIG_USER_ONLY
659 cpu->thread_id = qemu_get_thread_id();
661 /* This is a softmmu CPU object, so create a property for it
662 * so users can wire up its memory. (This can't go in qom/cpu.c
663 * because that file is compiled only once for both user-mode
664 * and system builds.) The default if no link is set up is to use
665 * the system address space.
667 object_property_add_link(OBJECT(cpu), "memory", TYPE_MEMORY_REGION,
668 (Object **)&cpu->memory,
669 qdev_prop_allow_set_link_before_realize,
670 OBJ_PROP_LINK_UNREF_ON_RELEASE,
671 &error_abort);
672 cpu->memory = system_memory;
673 object_ref(OBJECT(cpu->memory));
674 #endif
677 void cpu_exec_realizefn(CPUState *cpu, Error **errp)
679 CPUClass *cc ATTRIBUTE_UNUSED = CPU_GET_CLASS(cpu);
681 cpu_list_add(cpu);
683 #ifndef CONFIG_USER_ONLY
684 if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
685 vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
687 if (cc->vmsd != NULL) {
688 vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
690 #endif
693 #if defined(CONFIG_USER_ONLY)
694 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
696 mmap_lock();
697 tb_lock();
698 tb_invalidate_phys_page_range(pc, pc + 1, 0);
699 tb_unlock();
700 mmap_unlock();
702 #else
703 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
705 MemTxAttrs attrs;
706 hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
707 int asidx = cpu_asidx_from_attrs(cpu, attrs);
708 if (phys != -1) {
709 /* Locks grabbed by tb_invalidate_phys_addr */
710 tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
711 phys | (pc & ~TARGET_PAGE_MASK));
714 #endif
716 #if defined(CONFIG_USER_ONLY)
717 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
722 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
723 int flags)
725 return -ENOSYS;
728 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
732 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
733 int flags, CPUWatchpoint **watchpoint)
735 return -ENOSYS;
737 #else
738 /* Add a watchpoint. */
739 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
740 int flags, CPUWatchpoint **watchpoint)
742 CPUWatchpoint *wp;
744 /* forbid ranges which are empty or run off the end of the address space */
745 if (len == 0 || (addr + len - 1) < addr) {
746 error_report("tried to set invalid watchpoint at %"
747 VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
748 return -EINVAL;
750 wp = g_malloc(sizeof(*wp));
752 wp->vaddr = addr;
753 wp->len = len;
754 wp->flags = flags;
756 /* keep all GDB-injected watchpoints in front */
757 if (flags & BP_GDB) {
758 QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
759 } else {
760 QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
763 tlb_flush_page(cpu, addr);
765 if (watchpoint)
766 *watchpoint = wp;
767 return 0;
770 /* Remove a specific watchpoint. */
771 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
772 int flags)
774 CPUWatchpoint *wp;
776 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
777 if (addr == wp->vaddr && len == wp->len
778 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
779 cpu_watchpoint_remove_by_ref(cpu, wp);
780 return 0;
783 return -ENOENT;
786 /* Remove a specific watchpoint by reference. */
787 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
789 QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
791 tlb_flush_page(cpu, watchpoint->vaddr);
793 g_free(watchpoint);
796 /* Remove all matching watchpoints. */
797 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
799 CPUWatchpoint *wp, *next;
801 QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
802 if (wp->flags & mask) {
803 cpu_watchpoint_remove_by_ref(cpu, wp);
808 /* Return true if this watchpoint address matches the specified
809 * access (ie the address range covered by the watchpoint overlaps
810 * partially or completely with the address range covered by the
811 * access).
813 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
814 vaddr addr,
815 vaddr len)
817 /* We know the lengths are non-zero, but a little caution is
818 * required to avoid errors in the case where the range ends
819 * exactly at the top of the address space and so addr + len
820 * wraps round to zero.
822 vaddr wpend = wp->vaddr + wp->len - 1;
823 vaddr addrend = addr + len - 1;
825 return !(addr > wpend || wp->vaddr > addrend);
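/*
 * Worked example (illustrative): a watchpoint at vaddr 0xfffffffffffffff8 with
 * len 8 has wpend 0xffffffffffffffff; an access at addr 0xfffffffffffffffc
 * with len 4 has addrend 0xffffffffffffffff. Neither "addr > wpend" nor
 * "wp->vaddr > addrend" holds, so the ranges overlap. Comparing inclusive end
 * addresses is what avoids the wrap to zero that "addr + len" would produce
 * in this case.
 */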
828 #endif
830 /* Add a breakpoint. */
831 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
832 CPUBreakpoint **breakpoint)
834 CPUBreakpoint *bp;
836 bp = g_malloc(sizeof(*bp));
838 bp->pc = pc;
839 bp->flags = flags;
841 /* keep all GDB-injected breakpoints in front */
842 if (flags & BP_GDB) {
843 QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
844 } else {
845 QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
848 breakpoint_invalidate(cpu, pc);
850 if (breakpoint) {
851 *breakpoint = bp;
853 return 0;
856 /* Remove a specific breakpoint. */
857 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
859 CPUBreakpoint *bp;
861 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
862 if (bp->pc == pc && bp->flags == flags) {
863 cpu_breakpoint_remove_by_ref(cpu, bp);
864 return 0;
867 return -ENOENT;
870 /* Remove a specific breakpoint by reference. */
871 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
873 QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
875 breakpoint_invalidate(cpu, breakpoint->pc);
877 g_free(breakpoint);
880 /* Remove all matching breakpoints. */
881 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
883 CPUBreakpoint *bp, *next;
885 QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
886 if (bp->flags & mask) {
887 cpu_breakpoint_remove_by_ref(cpu, bp);
892 /* enable or disable single step mode. EXCP_DEBUG is returned by the
893 CPU loop after each instruction */
894 void cpu_single_step(CPUState *cpu, int enabled)
896 if (cpu->singlestep_enabled != enabled) {
897 cpu->singlestep_enabled = enabled;
898 if (kvm_enabled()) {
899 kvm_update_guest_debug(cpu, 0);
900 } else {
901 /* must flush all the translated code to avoid inconsistencies */
902 /* XXX: only flush what is necessary */
903 tb_flush(cpu);
908 void QEMU_NORETURN cpu_abort(CPUState *cpu, const char *fmt, ...)
910 va_list ap;
911 va_list ap2;
913 va_start(ap, fmt);
914 va_copy(ap2, ap);
915 fprintf(stderr, "qemu: fatal: ");
916 vfprintf(stderr, fmt, ap);
917 fprintf(stderr, "\n");
918 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
919 if (qemu_log_separate()) {
920 qemu_log_lock();
921 qemu_log("qemu: fatal: ");
922 qemu_log_vprintf(fmt, ap2);
923 qemu_log("\n");
924 log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
925 qemu_log_flush();
926 qemu_log_unlock();
927 qemu_log_close();
929 va_end(ap2);
930 va_end(ap);
931 replay_finish();
932 #if defined(CONFIG_USER_ONLY)
934 struct sigaction act;
935 sigfillset(&act.sa_mask);
936 act.sa_handler = SIG_DFL;
937 sigaction(SIGABRT, &act, NULL);
939 #endif
940 abort();
943 #if !defined(CONFIG_USER_ONLY)
944 /* Called from RCU critical section */
945 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
947 RAMBlock *block;
949 block = atomic_rcu_read(&ram_list.mru_block);
950 if (block && addr - block->offset < block->max_length) {
951 return block;
953 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
954 if (addr - block->offset < block->max_length) {
955 goto found;
959 fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
960 abort();
962 found:
963 /* It is safe to write mru_block outside the iothread lock. This
964 * is what happens:
966 * mru_block = xxx
967 * rcu_read_unlock()
968 * xxx removed from list
969 * rcu_read_lock()
970 * read mru_block
971 * mru_block = NULL;
972 * call_rcu(reclaim_ramblock, xxx);
973 * rcu_read_unlock()
975 * atomic_rcu_set is not needed here. The block was already published
976 * when it was placed into the list. Here we're just making an extra
977 * copy of the pointer.
979 ram_list.mru_block = block;
980 return block;
983 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
985 CPUState *cpu;
986 ram_addr_t start1;
987 RAMBlock *block;
988 ram_addr_t end;
990 end = TARGET_PAGE_ALIGN(start + length);
991 start &= TARGET_PAGE_MASK;
993 rcu_read_lock();
994 block = qemu_get_ram_block(start);
995 assert(block == qemu_get_ram_block(end - 1));
996 start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
997 CPU_FOREACH(cpu) {
998 tlb_reset_dirty(cpu, start1, length);
1000 rcu_read_unlock();
1003 /* Note: start and end must be within the same ram block. */
1004 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1005 ram_addr_t length,
1006 unsigned client)
1008 DirtyMemoryBlocks *blocks;
1009 unsigned long end, page;
1010 bool dirty = false;
1012 if (length == 0) {
1013 return false;
1016 end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1017 page = start >> TARGET_PAGE_BITS;
1019 rcu_read_lock();
1021 blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1023 while (page < end) {
1024 unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1025 unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1026 unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
1028 dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1029 offset, num);
1030 page += num;
1033 rcu_read_unlock();
1035 if (dirty && tcg_enabled()) {
1036 tlb_reset_dirty_range_all(start, length);
1039 return dirty;
1042 /* Called from RCU critical section */
1043 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1044 MemoryRegionSection *section,
1045 target_ulong vaddr,
1046 hwaddr paddr, hwaddr xlat,
1047 int prot,
1048 target_ulong *address)
1050 hwaddr iotlb;
1051 CPUWatchpoint *wp;
1053 if (memory_region_is_ram(section->mr)) {
1054 /* Normal RAM. */
1055 iotlb = memory_region_get_ram_addr(section->mr) + xlat;
1056 if (!section->readonly) {
1057 iotlb |= PHYS_SECTION_NOTDIRTY;
1058 } else {
1059 iotlb |= PHYS_SECTION_ROM;
1061 } else {
1062 AddressSpaceDispatch *d;
1064 d = atomic_rcu_read(&section->address_space->dispatch);
1065 iotlb = section - d->map.sections;
1066 iotlb += xlat;
1069 /* Make accesses to pages with watchpoints go via the
1070 watchpoint trap routines. */
1071 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1072 if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
1073 /* Avoid trapping reads of pages with a write breakpoint. */
1074 if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
1075 iotlb = PHYS_SECTION_WATCH + paddr;
1076 *address |= TLB_MMIO;
1077 break;
1082 return iotlb;
1084 #endif /* defined(CONFIG_USER_ONLY) */
1086 #if !defined(CONFIG_USER_ONLY)
1088 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
1089 uint16_t section);
1090 static subpage_t *subpage_init(AddressSpace *as, hwaddr base);
1092 static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
1093 qemu_anon_ram_alloc;
 1096  * Set a custom physical guest memory allocator.
1097 * Accelerators with unusual needs may need this. Hopefully, we can
1098 * get rid of it eventually.
1100 void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
1102 phys_mem_alloc = alloc;
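/*
 * Illustrative sketch only (not part of this file): an accelerator with
 * special allocation needs could install its own allocator, e.g. a
 * hypothetical backend that pins guest RAM with its kernel driver:
 *
 *     static void *my_accel_alloc_ram(size_t size, uint64_t *align)
 *     {
 *         void *ptr = qemu_anon_ram_alloc(size, align);
 *         // pin/register `ptr` with the accelerator's kernel driver here
 *         return ptr;
 *     }
 *
 *     ...
 *     phys_mem_set_alloc(my_accel_alloc_ram);
 */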
1105 static uint16_t phys_section_add(PhysPageMap *map,
1106 MemoryRegionSection *section)
1108 /* The physical section number is ORed with a page-aligned
1109 * pointer to produce the iotlb entries. Thus it should
1110 * never overflow into the page-aligned value.
1112 assert(map->sections_nb < TARGET_PAGE_SIZE);
1114 if (map->sections_nb == map->sections_nb_alloc) {
1115 map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1116 map->sections = g_renew(MemoryRegionSection, map->sections,
1117 map->sections_nb_alloc);
1119 map->sections[map->sections_nb] = *section;
1120 memory_region_ref(section->mr);
1121 return map->sections_nb++;
1124 static void phys_section_destroy(MemoryRegion *mr)
1126 bool have_sub_page = mr->subpage;
1128 memory_region_unref(mr);
1130 if (have_sub_page) {
1131 subpage_t *subpage = container_of(mr, subpage_t, iomem);
1132 object_unref(OBJECT(&subpage->iomem));
1133 g_free(subpage);
1137 static void phys_sections_free(PhysPageMap *map)
1139 while (map->sections_nb > 0) {
1140 MemoryRegionSection *section = &map->sections[--map->sections_nb];
1141 phys_section_destroy(section->mr);
1143 g_free(map->sections);
1144 g_free(map->nodes);
1147 static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
1149 subpage_t *subpage;
1150 hwaddr base = section->offset_within_address_space
1151 & TARGET_PAGE_MASK;
1152 MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
1153 d->map.nodes, d->map.sections);
1154 MemoryRegionSection subsection = {
1155 .offset_within_address_space = base,
1156 .size = int128_make64(TARGET_PAGE_SIZE),
1158 hwaddr start, end;
1160 assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1162 if (!(existing->mr->subpage)) {
1163 subpage = subpage_init(d->as, base);
1164 subsection.address_space = d->as;
1165 subsection.mr = &subpage->iomem;
1166 phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1167 phys_section_add(&d->map, &subsection));
1168 } else {
1169 subpage = container_of(existing->mr, subpage_t, iomem);
1171 start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1172 end = start + int128_get64(section->size) - 1;
1173 subpage_register(subpage, start, end,
1174 phys_section_add(&d->map, section));
1178 static void register_multipage(AddressSpaceDispatch *d,
1179 MemoryRegionSection *section)
1181 hwaddr start_addr = section->offset_within_address_space;
1182 uint16_t section_index = phys_section_add(&d->map, section);
1183 uint64_t num_pages = int128_get64(int128_rshift(section->size,
1184 TARGET_PAGE_BITS));
1186 assert(num_pages);
1187 phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1190 static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
1192 AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
1193 AddressSpaceDispatch *d = as->next_dispatch;
1194 MemoryRegionSection now = *section, remain = *section;
1195 Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1197 if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
1198 uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
1199 - now.offset_within_address_space;
1201 now.size = int128_min(int128_make64(left), now.size);
1202 register_subpage(d, &now);
1203 } else {
1204 now.size = int128_zero();
1206 while (int128_ne(remain.size, now.size)) {
1207 remain.size = int128_sub(remain.size, now.size);
1208 remain.offset_within_address_space += int128_get64(now.size);
1209 remain.offset_within_region += int128_get64(now.size);
1210 now = remain;
1211 if (int128_lt(remain.size, page_size)) {
1212 register_subpage(d, &now);
1213 } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1214 now.size = page_size;
1215 register_subpage(d, &now);
1216 } else {
1217 now.size = int128_and(now.size, int128_neg(page_size));
1218 register_multipage(d, &now);
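/*
 * Worked example (illustrative, assuming 4 KiB target pages): a section with
 * offset_within_address_space 0x1800 and size 0x3000 is registered in three
 * pieces:
 *   [0x1800, 0x1fff]  unaligned head   -> register_subpage()
 *   [0x2000, 0x3fff]  two whole pages  -> register_multipage()
 *   [0x4000, 0x47ff]  partial tail     -> register_subpage()
 */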
1223 void qemu_flush_coalesced_mmio_buffer(void)
1225 if (kvm_enabled())
1226 kvm_flush_coalesced_mmio_buffer();
1229 void qemu_mutex_lock_ramlist(void)
1231 qemu_mutex_lock(&ram_list.mutex);
1234 void qemu_mutex_unlock_ramlist(void)
1236 qemu_mutex_unlock(&ram_list.mutex);
1239 #ifdef __linux__
1240 static int64_t get_file_size(int fd)
1242 int64_t size = lseek(fd, 0, SEEK_END);
1243 if (size < 0) {
1244 return -errno;
1246 return size;
1249 static void *file_ram_alloc(RAMBlock *block,
1250 ram_addr_t memory,
1251 const char *path,
1252 Error **errp)
1254 bool unlink_on_error = false;
1255 char *filename;
1256 char *sanitized_name;
1257 char *c;
1258 void * volatile area = MAP_FAILED;
1259 int fd = -1;
1260 int64_t file_size;
1262 if (kvm_enabled() && !kvm_has_sync_mmu()) {
1263 error_setg(errp,
1264 "host lacks kvm mmu notifiers, -mem-path unsupported");
1265 return NULL;
1268 for (;;) {
1269 fd = open(path, O_RDWR);
1270 if (fd >= 0) {
1271 /* @path names an existing file, use it */
1272 break;
1274 if (errno == ENOENT) {
1275 /* @path names a file that doesn't exist, create it */
1276 fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1277 if (fd >= 0) {
1278 unlink_on_error = true;
1279 break;
1281 } else if (errno == EISDIR) {
1282 /* @path names a directory, create a file there */
1283 /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1284 sanitized_name = g_strdup(memory_region_name(block->mr));
1285 for (c = sanitized_name; *c != '\0'; c++) {
1286 if (*c == '/') {
1287 *c = '_';
1291 filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1292 sanitized_name);
1293 g_free(sanitized_name);
1295 fd = mkstemp(filename);
1296 if (fd >= 0) {
1297 unlink(filename);
1298 g_free(filename);
1299 break;
1301 g_free(filename);
1303 if (errno != EEXIST && errno != EINTR) {
1304 error_setg_errno(errp, errno,
1305 "can't open backing store %s for guest RAM",
1306 path);
1307 goto error;
1310 * Try again on EINTR and EEXIST. The latter happens when
1311 * something else creates the file between our two open().
1315 block->page_size = qemu_fd_getpagesize(fd);
1316 block->mr->align = block->page_size;
1317 #if defined(__s390x__)
1318 if (kvm_enabled()) {
1319 block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1321 #endif
1323 file_size = get_file_size(fd);
1325 if (memory < block->page_size) {
1326 error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1327 "or larger than page size 0x%zx",
1328 memory, block->page_size);
1329 goto error;
1332 if (file_size > 0 && file_size < memory) {
1333 error_setg(errp, "backing store %s size 0x%" PRIx64
1334 " does not match 'size' option 0x" RAM_ADDR_FMT,
1335 path, file_size, memory);
1336 goto error;
1339 memory = ROUND_UP(memory, block->page_size);
1342 * ftruncate is not supported by hugetlbfs in older
1343 * hosts, so don't bother bailing out on errors.
1344 * If anything goes wrong with it under other filesystems,
1345 * mmap will fail.
1347 * Do not truncate the non-empty backend file to avoid corrupting
1348 * the existing data in the file. Disabling shrinking is not
1349 * enough. For example, the current vNVDIMM implementation stores
1350 * the guest NVDIMM labels at the end of the backend file. If the
1351 * backend file is later extended, QEMU will not be able to find
1352 * those labels. Therefore, extending the non-empty backend file
1353 * is disabled as well.
1355 if (!file_size && ftruncate(fd, memory)) {
1356 perror("ftruncate");
1359 area = qemu_ram_mmap(fd, memory, block->mr->align,
1360 block->flags & RAM_SHARED);
1361 if (area == MAP_FAILED) {
1362 error_setg_errno(errp, errno,
1363 "unable to map backing store for guest RAM");
1364 goto error;
1367 if (mem_prealloc) {
1368 os_mem_prealloc(fd, area, memory, errp);
1369 if (errp && *errp) {
1370 goto error;
1374 block->fd = fd;
1375 return area;
1377 error:
1378 if (area != MAP_FAILED) {
1379 qemu_ram_munmap(area, memory);
1381 if (unlink_on_error) {
1382 unlink(path);
1384 if (fd != -1) {
1385 close(fd);
1387 return NULL;
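/*
 * Note (illustrative): this path is reached for file-backed guest RAM, for
 * example when the guest is started with something like
 *
 *     qemu-system-x86_64 -m 4096 -mem-path /dev/hugepages -mem-prealloc ...
 *
 * where -mem-prealloc additionally triggers the os_mem_prealloc() call above.
 */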
1389 #endif
1391 /* Called with the ramlist lock held. */
1392 static ram_addr_t find_ram_offset(ram_addr_t size)
1394 RAMBlock *block, *next_block;
1395 ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1397 assert(size != 0); /* it would hand out same offset multiple times */
1399 if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1400 return 0;
1403 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1404 ram_addr_t end, next = RAM_ADDR_MAX;
1406 end = block->offset + block->max_length;
1408 QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
1409 if (next_block->offset >= end) {
1410 next = MIN(next, next_block->offset);
1413 if (next - end >= size && next - end < mingap) {
1414 offset = end;
1415 mingap = next - end;
1419 if (offset == RAM_ADDR_MAX) {
1420 fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1421 (uint64_t)size);
1422 abort();
1425 return offset;
1428 ram_addr_t last_ram_offset(void)
1430 RAMBlock *block;
1431 ram_addr_t last = 0;
1433 rcu_read_lock();
1434 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1435 last = MAX(last, block->offset + block->max_length);
1437 rcu_read_unlock();
1438 return last;
1441 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1443 int ret;
 1445     /* Use MADV_DONTDUMP if the user doesn't want the guest memory in the core dump */
1446 if (!machine_dump_guest_core(current_machine)) {
1447 ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1448 if (ret) {
1449 perror("qemu_madvise");
1450 fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1451 "but dump_guest_core=off specified\n");
1456 const char *qemu_ram_get_idstr(RAMBlock *rb)
1458 return rb->idstr;
1461 /* Called with iothread lock held. */
1462 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1464 RAMBlock *block;
1466 assert(new_block);
1467 assert(!new_block->idstr[0]);
1469 if (dev) {
1470 char *id = qdev_get_dev_path(dev);
1471 if (id) {
1472 snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1473 g_free(id);
1476 pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1478 rcu_read_lock();
1479 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1480 if (block != new_block &&
1481 !strcmp(block->idstr, new_block->idstr)) {
1482 fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1483 new_block->idstr);
1484 abort();
1487 rcu_read_unlock();
1490 /* Called with iothread lock held. */
1491 void qemu_ram_unset_idstr(RAMBlock *block)
1493 /* FIXME: arch_init.c assumes that this is not called throughout
1494 * migration. Ignore the problem since hot-unplug during migration
1495 * does not work anyway.
1497 if (block) {
1498 memset(block->idstr, 0, sizeof(block->idstr));
1502 size_t qemu_ram_pagesize(RAMBlock *rb)
1504 return rb->page_size;
1507 static int memory_try_enable_merging(void *addr, size_t len)
1509 if (!machine_mem_merge(current_machine)) {
1510 /* disabled by the user */
1511 return 0;
1514 return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1517 /* Only legal before guest might have detected the memory size: e.g. on
1518 * incoming migration, or right after reset.
 1520  * As the memory core doesn't know how the memory is accessed, it is up to
 1521  * the resize callback to update device state and/or add assertions to detect
1522 * misuse, if necessary.
1524 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1526 assert(block);
1528 newsize = HOST_PAGE_ALIGN(newsize);
1530 if (block->used_length == newsize) {
1531 return 0;
1534 if (!(block->flags & RAM_RESIZEABLE)) {
1535 error_setg_errno(errp, EINVAL,
1536 "Length mismatch: %s: 0x" RAM_ADDR_FMT
1537 " in != 0x" RAM_ADDR_FMT, block->idstr,
1538 newsize, block->used_length);
1539 return -EINVAL;
1542 if (block->max_length < newsize) {
1543 error_setg_errno(errp, EINVAL,
1544 "Length too large: %s: 0x" RAM_ADDR_FMT
1545 " > 0x" RAM_ADDR_FMT, block->idstr,
1546 newsize, block->max_length);
1547 return -EINVAL;
1550 cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1551 block->used_length = newsize;
1552 cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
1553 DIRTY_CLIENTS_ALL);
1554 memory_region_set_size(block->mr, newsize);
1555 if (block->resized) {
1556 block->resized(block->idstr, newsize, block->host);
1558 return 0;
1561 /* Called with ram_list.mutex held */
1562 static void dirty_memory_extend(ram_addr_t old_ram_size,
1563 ram_addr_t new_ram_size)
1565 ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
1566 DIRTY_MEMORY_BLOCK_SIZE);
1567 ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
1568 DIRTY_MEMORY_BLOCK_SIZE);
1569 int i;
1571 /* Only need to extend if block count increased */
1572 if (new_num_blocks <= old_num_blocks) {
1573 return;
1576 for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
1577 DirtyMemoryBlocks *old_blocks;
1578 DirtyMemoryBlocks *new_blocks;
1579 int j;
1581 old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
1582 new_blocks = g_malloc(sizeof(*new_blocks) +
1583 sizeof(new_blocks->blocks[0]) * new_num_blocks);
1585 if (old_num_blocks) {
1586 memcpy(new_blocks->blocks, old_blocks->blocks,
1587 old_num_blocks * sizeof(old_blocks->blocks[0]));
1590 for (j = old_num_blocks; j < new_num_blocks; j++) {
1591 new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
1594 atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
1596 if (old_blocks) {
1597 g_free_rcu(old_blocks, rcu);
1602 static void ram_block_add(RAMBlock *new_block, Error **errp)
1604 RAMBlock *block;
1605 RAMBlock *last_block = NULL;
1606 ram_addr_t old_ram_size, new_ram_size;
1607 Error *err = NULL;
1609 old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;
1611 qemu_mutex_lock_ramlist();
1612 new_block->offset = find_ram_offset(new_block->max_length);
1614 if (!new_block->host) {
1615 if (xen_enabled()) {
1616 xen_ram_alloc(new_block->offset, new_block->max_length,
1617 new_block->mr, &err);
1618 if (err) {
1619 error_propagate(errp, err);
1620 qemu_mutex_unlock_ramlist();
1621 return;
1623 } else {
1624 new_block->host = phys_mem_alloc(new_block->max_length,
1625 &new_block->mr->align);
 1627              * With HAX, QEMU allocates the virtual address space and the HAX kernel
 1628              * populates it with physical memory. Currently there is no paging, so the
 1629              * user should make sure there is enough free memory in advance.
1631 if (hax_enabled()) {
1632 int ret;
1633 ret = hax_populate_ram((uint64_t)(uintptr_t)new_block->host,
1634 new_block->max_length);
1635 if (ret < 0) {
1636 error_setg(errp, "Hax failed to populate ram");
1637 return;
1641 if (!new_block->host) {
1642 error_setg_errno(errp, errno,
1643 "cannot set up guest memory '%s'",
1644 memory_region_name(new_block->mr));
1645 qemu_mutex_unlock_ramlist();
1646 return;
1648 memory_try_enable_merging(new_block->host, new_block->max_length);
1652 new_ram_size = MAX(old_ram_size,
1653 (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
1654 if (new_ram_size > old_ram_size) {
1655 migration_bitmap_extend(old_ram_size, new_ram_size);
1656 dirty_memory_extend(old_ram_size, new_ram_size);
1658 /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
1659 * QLIST (which has an RCU-friendly variant) does not have insertion at
1660 * tail, so save the last element in last_block.
1662 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1663 last_block = block;
1664 if (block->max_length < new_block->max_length) {
1665 break;
1668 if (block) {
1669 QLIST_INSERT_BEFORE_RCU(block, new_block, next);
1670 } else if (last_block) {
1671 QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
1672 } else { /* list is empty */
1673 QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
1675 ram_list.mru_block = NULL;
1677 /* Write list before version */
1678 smp_wmb();
1679 ram_list.version++;
1680 qemu_mutex_unlock_ramlist();
1682 cpu_physical_memory_set_dirty_range(new_block->offset,
1683 new_block->used_length,
1684 DIRTY_CLIENTS_ALL);
1686 if (new_block->host) {
1687 qemu_ram_setup_dump(new_block->host, new_block->max_length);
1688 qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
1689 /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
1690 qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
1694 #ifdef __linux__
1695 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
1696 bool share, const char *mem_path,
1697 Error **errp)
1699 RAMBlock *new_block;
1700 Error *local_err = NULL;
1702 if (xen_enabled()) {
1703 error_setg(errp, "-mem-path not supported with Xen");
1704 return NULL;
1707 if (phys_mem_alloc != qemu_anon_ram_alloc) {
1709 * file_ram_alloc() needs to allocate just like
1710 * phys_mem_alloc, but we haven't bothered to provide
1711 * a hook there.
1713 error_setg(errp,
1714 "-mem-path not supported with this accelerator");
1715 return NULL;
1718 size = HOST_PAGE_ALIGN(size);
1719 new_block = g_malloc0(sizeof(*new_block));
1720 new_block->mr = mr;
1721 new_block->used_length = size;
1722 new_block->max_length = size;
1723 new_block->flags = share ? RAM_SHARED : 0;
1724 new_block->host = file_ram_alloc(new_block, size,
1725 mem_path, errp);
1726 if (!new_block->host) {
1727 g_free(new_block);
1728 return NULL;
1731 ram_block_add(new_block, &local_err);
1732 if (local_err) {
1733 g_free(new_block);
1734 error_propagate(errp, local_err);
1735 return NULL;
1737 return new_block;
1739 #endif
1741 static
1742 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
1743 void (*resized)(const char*,
1744 uint64_t length,
1745 void *host),
1746 void *host, bool resizeable,
1747 MemoryRegion *mr, Error **errp)
1749 RAMBlock *new_block;
1750 Error *local_err = NULL;
1752 size = HOST_PAGE_ALIGN(size);
1753 max_size = HOST_PAGE_ALIGN(max_size);
1754 new_block = g_malloc0(sizeof(*new_block));
1755 new_block->mr = mr;
1756 new_block->resized = resized;
1757 new_block->used_length = size;
1758 new_block->max_length = max_size;
1759 assert(max_size >= size);
1760 new_block->fd = -1;
1761 new_block->page_size = getpagesize();
1762 new_block->host = host;
1763 if (host) {
1764 new_block->flags |= RAM_PREALLOC;
1766 if (resizeable) {
1767 new_block->flags |= RAM_RESIZEABLE;
1769 ram_block_add(new_block, &local_err);
1770 if (local_err) {
1771 g_free(new_block);
1772 error_propagate(errp, local_err);
1773 return NULL;
1775 return new_block;
1778 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
1779 MemoryRegion *mr, Error **errp)
1781 return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
1784 RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
1786 return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
1789 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
1790 void (*resized)(const char*,
1791 uint64_t length,
1792 void *host),
1793 MemoryRegion *mr, Error **errp)
1795 return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
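/*
 * Illustrative sketch only (not part of this file): a device that wants RAM
 * whose used_length can grow later might do, with hypothetical names:
 *
 *     static void my_dev_resized(const char *id, uint64_t new_len, void *host)
 *     {
 *         // update device-internal bookkeeping for the new size
 *     }
 *
 *     rb = qemu_ram_alloc_resizeable(16 * 1024 * 1024, 64 * 1024 * 1024,
 *                                    my_dev_resized, mr, &error_fatal);
 *     ...
 *     // later, e.g. on incoming migration, before the guest sees the RAM:
 *     qemu_ram_resize(rb, 32 * 1024 * 1024, &error_fatal);
 */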
1798 static void reclaim_ramblock(RAMBlock *block)
1800 if (block->flags & RAM_PREALLOC) {
1802 } else if (xen_enabled()) {
1803 xen_invalidate_map_cache_entry(block->host);
1804 #ifndef _WIN32
1805 } else if (block->fd >= 0) {
1806 qemu_ram_munmap(block->host, block->max_length);
1807 close(block->fd);
1808 #endif
1809 } else {
1810 qemu_anon_ram_free(block->host, block->max_length);
1812 g_free(block);
1815 void qemu_ram_free(RAMBlock *block)
1817 if (!block) {
1818 return;
1821 qemu_mutex_lock_ramlist();
1822 QLIST_REMOVE_RCU(block, next);
1823 ram_list.mru_block = NULL;
1824 /* Write list before version */
1825 smp_wmb();
1826 ram_list.version++;
1827 call_rcu(block, reclaim_ramblock, rcu);
1828 qemu_mutex_unlock_ramlist();
1831 #ifndef _WIN32
1832 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
1834 RAMBlock *block;
1835 ram_addr_t offset;
1836 int flags;
1837 void *area, *vaddr;
1839 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1840 offset = addr - block->offset;
1841 if (offset < block->max_length) {
1842 vaddr = ramblock_ptr(block, offset);
1843 if (block->flags & RAM_PREALLOC) {
1845 } else if (xen_enabled()) {
1846 abort();
1847 } else {
1848 flags = MAP_FIXED;
1849 if (block->fd >= 0) {
1850 flags |= (block->flags & RAM_SHARED ?
1851 MAP_SHARED : MAP_PRIVATE);
1852 area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1853 flags, block->fd, offset);
1854 } else {
1856 * Remap needs to match alloc. Accelerators that
1857 * set phys_mem_alloc never remap. If they did,
1858 * we'd need a remap hook here.
1860 assert(phys_mem_alloc == qemu_anon_ram_alloc);
1862 flags |= MAP_PRIVATE | MAP_ANONYMOUS;
1863 area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1864 flags, -1, 0);
1866 if (area != vaddr) {
1867 fprintf(stderr, "Could not remap addr: "
1868 RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
1869 length, addr);
1870 exit(1);
1872 memory_try_enable_merging(vaddr, length);
1873 qemu_ram_setup_dump(vaddr, length);
1878 #endif /* !_WIN32 */
1880 /* Return a host pointer to ram allocated with qemu_ram_alloc.
1881 * This should not be used for general purpose DMA. Use address_space_map
1882 * or address_space_rw instead. For local memory (e.g. video ram) that the
1883 * device owns, use memory_region_get_ram_ptr.
1885 * Called within RCU critical section.
1887 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
1889 RAMBlock *block = ram_block;
1891 if (block == NULL) {
1892 block = qemu_get_ram_block(addr);
1893 addr -= block->offset;
1896 if (xen_enabled() && block->host == NULL) {
1897 /* We need to check if the requested address is in the RAM
1898 * because we don't want to map the entire memory in QEMU.
1899 * In that case just map until the end of the page.
1901 if (block->offset == 0) {
1902 return xen_map_cache(addr, 0, 0);
1905 block->host = xen_map_cache(block->offset, block->max_length, 1);
1907 return ramblock_ptr(block, addr);
1910 /* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
1911 * but takes a size argument.
1913 * Called within RCU critical section.
1915 static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
1916 hwaddr *size)
1918 RAMBlock *block = ram_block;
1919 if (*size == 0) {
1920 return NULL;
1923 if (block == NULL) {
1924 block = qemu_get_ram_block(addr);
1925 addr -= block->offset;
1927 *size = MIN(*size, block->max_length - addr);
1929 if (xen_enabled() && block->host == NULL) {
1930 /* We need to check if the requested address is in the RAM
1931 * because we don't want to map the entire memory in QEMU.
1932 * In that case just map the requested area.
1934 if (block->offset == 0) {
1935 return xen_map_cache(addr, *size, 1);
1938 block->host = xen_map_cache(block->offset, block->max_length, 1);
1941 return ramblock_ptr(block, addr);
1945 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
1946 * in that RAMBlock.
1948 * ptr: Host pointer to look up
1949 * round_offset: If true round the result offset down to a page boundary
1951 * *offset: set to result offset within the RAMBlock
1953 * Returns: RAMBlock (or NULL if not found)
1955 * By the time this function returns, the returned pointer is not protected
1956 * by RCU anymore. If the caller is not within an RCU critical section and
1957 * does not hold the iothread lock, it must have other means of protecting the
1958 * pointer, such as a reference to the region that includes the incoming
1959 * ram_addr_t.
1961 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
1962 ram_addr_t *offset)
1964 RAMBlock *block;
1965 uint8_t *host = ptr;
1967 if (xen_enabled()) {
1968 ram_addr_t ram_addr;
1969 rcu_read_lock();
1970 ram_addr = xen_ram_addr_from_mapcache(ptr);
1971 block = qemu_get_ram_block(ram_addr);
1972 if (block) {
1973 *offset = ram_addr - block->offset;
1975 rcu_read_unlock();
1976 return block;
1979 rcu_read_lock();
1980 block = atomic_rcu_read(&ram_list.mru_block);
1981 if (block && block->host && host - block->host < block->max_length) {
1982 goto found;
1985 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 1986         /* This case appears when the block is not mapped. */
1987 if (block->host == NULL) {
1988 continue;
1990 if (host - block->host < block->max_length) {
1991 goto found;
1995 rcu_read_unlock();
1996 return NULL;
1998 found:
1999 *offset = (host - block->host);
2000 if (round_offset) {
2001 *offset &= TARGET_PAGE_MASK;
2003 rcu_read_unlock();
2004 return block;
2008 * Finds the named RAMBlock
2010 * name: The name of RAMBlock to find
2012 * Returns: RAMBlock (or NULL if not found)
2014 RAMBlock *qemu_ram_block_by_name(const char *name)
2016 RAMBlock *block;
2018 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2019 if (!strcmp(name, block->idstr)) {
2020 return block;
2024 return NULL;
2027 /* Some of the softmmu routines need to translate from a host pointer
2028 (typically a TLB entry) back to a ram offset. */
2029 ram_addr_t qemu_ram_addr_from_host(void *ptr)
2031 RAMBlock *block;
2032 ram_addr_t offset;
2034 block = qemu_ram_block_from_host(ptr, false, &offset);
2035 if (!block) {
2036 return RAM_ADDR_INVALID;
2039 return block->offset + offset;
2042 /* Called within RCU critical section. */
2043 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
2044 uint64_t val, unsigned size)
2046 bool locked = false;
2048 if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
2049 locked = true;
2050 tb_lock();
2051 tb_invalidate_phys_page_fast(ram_addr, size);
2053 switch (size) {
2054 case 1:
2055 stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2056 break;
2057 case 2:
2058 stw_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2059 break;
2060 case 4:
2061 stl_p(qemu_map_ram_ptr(NULL, ram_addr), val);
2062 break;
2063 default:
2064 abort();
2067 if (locked) {
2068 tb_unlock();
2071 /* Set both VGA and migration bits for simplicity and to remove
2072 * the notdirty callback faster.
2074 cpu_physical_memory_set_dirty_range(ram_addr, size,
2075 DIRTY_CLIENTS_NOCODE);
2076 /* we remove the notdirty callback only if the code has been
2077 flushed */
2078 if (!cpu_physical_memory_is_clean(ram_addr)) {
2079 tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
2083 static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
2084 unsigned size, bool is_write)
2086 return is_write;
2089 static const MemoryRegionOps notdirty_mem_ops = {
2090 .write = notdirty_mem_write,
2091 .valid.accepts = notdirty_mem_accepts,
2092 .endianness = DEVICE_NATIVE_ENDIAN,
2095 /* Generate a debug exception if a watchpoint has been hit. */
2096 static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
2098 CPUState *cpu = current_cpu;
2099 CPUClass *cc = CPU_GET_CLASS(cpu);
2100 CPUArchState *env = cpu->env_ptr;
2101 target_ulong pc, cs_base;
2102 target_ulong vaddr;
2103 CPUWatchpoint *wp;
2104 uint32_t cpu_flags;
2106 if (cpu->watchpoint_hit) {
2107 /* We re-entered the check after replacing the TB. Now raise
 2108          * the debug interrupt so that it will trigger after the
2109 * current instruction. */
2110 cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2111 return;
2113 vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
2114 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2115 if (cpu_watchpoint_address_matches(wp, vaddr, len)
2116 && (wp->flags & flags)) {
2117 if (flags == BP_MEM_READ) {
2118 wp->flags |= BP_WATCHPOINT_HIT_READ;
2119 } else {
2120 wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2122 wp->hitaddr = vaddr;
2123 wp->hitattrs = attrs;
2124 if (!cpu->watchpoint_hit) {
2125 if (wp->flags & BP_CPU &&
2126 !cc->debug_check_watchpoint(cpu, wp)) {
2127 wp->flags &= ~BP_WATCHPOINT_HIT;
2128 continue;
2130 cpu->watchpoint_hit = wp;
2132 /* The tb_lock will be reset when cpu_loop_exit or
2133 * cpu_loop_exit_noexc longjmp back into the cpu_exec
2134 * main loop.
2136 tb_lock();
2137 tb_check_watchpoint(cpu);
2138 if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2139 cpu->exception_index = EXCP_DEBUG;
2140 cpu_loop_exit(cpu);
2141 } else {
2142 cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
2143 tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
2144 cpu_loop_exit_noexc(cpu);
2147 } else {
2148 wp->flags &= ~BP_WATCHPOINT_HIT;
2153 /* Watchpoint access routines. Watchpoints are inserted using TLB tricks,
2154 so these check for a hit then pass through to the normal out-of-line
2155 phys routines. */
2156 static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
2157 unsigned size, MemTxAttrs attrs)
2159 MemTxResult res;
2160 uint64_t data;
2161 int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2162 AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2164 check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
2165 switch (size) {
2166 case 1:
2167 data = address_space_ldub(as, addr, attrs, &res);
2168 break;
2169 case 2:
2170 data = address_space_lduw(as, addr, attrs, &res);
2171 break;
2172 case 4:
2173 data = address_space_ldl(as, addr, attrs, &res);
2174 break;
2175 default: abort();
2177 *pdata = data;
2178 return res;
2181 static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
2182 uint64_t val, unsigned size,
2183 MemTxAttrs attrs)
2185 MemTxResult res;
2186 int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
2187 AddressSpace *as = current_cpu->cpu_ases[asidx].as;
2189 check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
2190 switch (size) {
2191 case 1:
2192 address_space_stb(as, addr, val, attrs, &res);
2193 break;
2194 case 2:
2195 address_space_stw(as, addr, val, attrs, &res);
2196 break;
2197 case 4:
2198 address_space_stl(as, addr, val, attrs, &res);
2199 break;
2200 default: abort();
2202 return res;
2205 static const MemoryRegionOps watch_mem_ops = {
2206 .read_with_attrs = watch_mem_read,
2207 .write_with_attrs = watch_mem_write,
2208 .endianness = DEVICE_NATIVE_ENDIAN,
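/* Example (illustrative sketch): the routines above are only reached for
 * pages carrying a watchpoint, because inserting one forces those pages
 * through the slow path.  A debugger front end would arm a write
 * watchpoint roughly like this; the helper and its error handling are
 * simplified assumptions, not the gdbstub's actual code.
 */
static void example_arm_write_watchpoint(CPUState *cpu, vaddr addr, vaddr len)
{
    CPUWatchpoint *wp;

    if (cpu_watchpoint_insert(cpu, addr, len, BP_GDB | BP_MEM_WRITE, &wp)) {
        /* Insertion can fail, e.g. for an unsupported length. */
        return;
    }
    /* Later: cpu_watchpoint_remove_by_ref(cpu, wp); */
}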
2211 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2212 unsigned len, MemTxAttrs attrs)
2214 subpage_t *subpage = opaque;
2215 uint8_t buf[8];
2216 MemTxResult res;
2218 #if defined(DEBUG_SUBPAGE)
2219 printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2220 subpage, len, addr);
2221 #endif
2222 res = address_space_read(subpage->as, addr + subpage->base,
2223 attrs, buf, len);
2224 if (res) {
2225 return res;
2227 switch (len) {
2228 case 1:
2229 *data = ldub_p(buf);
2230 return MEMTX_OK;
2231 case 2:
2232 *data = lduw_p(buf);
2233 return MEMTX_OK;
2234 case 4:
2235 *data = ldl_p(buf);
2236 return MEMTX_OK;
2237 case 8:
2238 *data = ldq_p(buf);
2239 return MEMTX_OK;
2240 default:
2241 abort();
2245 static MemTxResult subpage_write(void *opaque, hwaddr addr,
2246 uint64_t value, unsigned len, MemTxAttrs attrs)
2248 subpage_t *subpage = opaque;
2249 uint8_t buf[8];
2251 #if defined(DEBUG_SUBPAGE)
2252 printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2253 " value %"PRIx64"\n",
2254 __func__, subpage, len, addr, value);
2255 #endif
2256 switch (len) {
2257 case 1:
2258 stb_p(buf, value);
2259 break;
2260 case 2:
2261 stw_p(buf, value);
2262 break;
2263 case 4:
2264 stl_p(buf, value);
2265 break;
2266 case 8:
2267 stq_p(buf, value);
2268 break;
2269 default:
2270 abort();
2272 return address_space_write(subpage->as, addr + subpage->base,
2273 attrs, buf, len);
2276 static bool subpage_accepts(void *opaque, hwaddr addr,
2277 unsigned len, bool is_write)
2279 subpage_t *subpage = opaque;
2280 #if defined(DEBUG_SUBPAGE)
2281 printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2282 __func__, subpage, is_write ? 'w' : 'r', len, addr);
2283 #endif
2285 return address_space_access_valid(subpage->as, addr + subpage->base,
2286 len, is_write);
2289 static const MemoryRegionOps subpage_ops = {
2290 .read_with_attrs = subpage_read,
2291 .write_with_attrs = subpage_write,
2292 .impl.min_access_size = 1,
2293 .impl.max_access_size = 8,
2294 .valid.min_access_size = 1,
2295 .valid.max_access_size = 8,
2296 .valid.accepts = subpage_accepts,
2297 .endianness = DEVICE_NATIVE_ENDIAN,
2300 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2301 uint16_t section)
2303 int idx, eidx;
2305 if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2306 return -1;
2307 idx = SUBPAGE_IDX(start);
2308 eidx = SUBPAGE_IDX(end);
2309 #if defined(DEBUG_SUBPAGE)
2310 printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2311 __func__, mmio, start, end, idx, eidx, section);
2312 #endif
2313 for (; idx <= eidx; idx++) {
2314 mmio->sub_section[idx] = section;
2317 return 0;
2320 static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2322 subpage_t *mmio;
2324 mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2325 mmio->as = as;
2326 mmio->base = base;
2327 memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2328 NULL, TARGET_PAGE_SIZE);
2329 mmio->iomem.subpage = true;
2330 #if defined(DEBUG_SUBPAGE)
2331 printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2332 mmio, base, TARGET_PAGE_SIZE);
2333 #endif
2334 subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2336 return mmio;
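/* Example (illustrative sketch): how a caller such as register_subpage()
 * combines the two helpers above - create a subpage container for the page
 * holding 'base' and point a byte range inside it at a phys section index.
 * The wrapper itself is hypothetical.
 */
static void example_map_subpage_range(AddressSpace *as, hwaddr base,
                                      uint32_t start, uint32_t end,
                                      uint16_t section)
{
    subpage_t *subpage = subpage_init(as, base & TARGET_PAGE_MASK);

    subpage_register(subpage, start, end, section);
}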
2339 static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
2340 MemoryRegion *mr)
2342 assert(as);
2343 MemoryRegionSection section = {
2344 .address_space = as,
2345 .mr = mr,
2346 .offset_within_address_space = 0,
2347 .offset_within_region = 0,
2348 .size = int128_2_64(),
2351 return phys_section_add(map, &section);
2354 MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
2356 int asidx = cpu_asidx_from_attrs(cpu, attrs);
2357 CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2358 AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2359 MemoryRegionSection *sections = d->map.sections;
2361 return sections[index & ~TARGET_PAGE_MASK].mr;
2364 static void io_mem_init(void)
2366 memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2367 memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2368 NULL, UINT64_MAX);
2369 memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2370 NULL, UINT64_MAX);
2371 memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2372 NULL, UINT64_MAX);
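/* Example (illustrative sketch): a device model builds its MMIO window the
 * same way the special regions above are built, except that it supplies
 * real handlers and attaches the region to the system memory tree.  The
 * device, its ops, the base address and the 0x1000 size are hypothetical.
 */
static uint64_t example_dev_read(void *opaque, hwaddr addr, unsigned size)
{
    return 0; /* registers read as zero in this sketch */
}

static void example_dev_write(void *opaque, hwaddr addr,
                              uint64_t val, unsigned size)
{
    /* latch the value into the (hypothetical) device state */
}

static const MemoryRegionOps example_dev_ops = {
    .read = example_dev_read,
    .write = example_dev_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
};

static void example_dev_map_mmio(MemoryRegion *mmio, void *dev_state)
{
    memory_region_init_io(mmio, NULL, &example_dev_ops, dev_state,
                          "example-dev-mmio", 0x1000);
    memory_region_add_subregion(get_system_memory(), 0xfe000000, mmio);
}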
2375 static void mem_begin(MemoryListener *listener)
2377 AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2378 AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2379 uint16_t n;
2381 n = dummy_section(&d->map, as, &io_mem_unassigned);
2382 assert(n == PHYS_SECTION_UNASSIGNED);
2383 n = dummy_section(&d->map, as, &io_mem_notdirty);
2384 assert(n == PHYS_SECTION_NOTDIRTY);
2385 n = dummy_section(&d->map, as, &io_mem_rom);
2386 assert(n == PHYS_SECTION_ROM);
2387 n = dummy_section(&d->map, as, &io_mem_watch);
2388 assert(n == PHYS_SECTION_WATCH);
2390 d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2391 d->as = as;
2392 as->next_dispatch = d;
2395 static void address_space_dispatch_free(AddressSpaceDispatch *d)
2397 phys_sections_free(&d->map);
2398 g_free(d);
2401 static void mem_commit(MemoryListener *listener)
2403 AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2404 AddressSpaceDispatch *cur = as->dispatch;
2405 AddressSpaceDispatch *next = as->next_dispatch;
2407 phys_page_compact_all(next, next->map.nodes_nb);
2409 atomic_rcu_set(&as->dispatch, next);
2410 if (cur) {
2411 call_rcu(cur, address_space_dispatch_free, rcu);
2415 static void tcg_commit(MemoryListener *listener)
2417 CPUAddressSpace *cpuas;
2418 AddressSpaceDispatch *d;
2420 /* since each CPU stores ram addresses in its TLB cache, we must
2421 reset the modified entries */
2422 cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2423 cpu_reloading_memory_map();
2424 /* The CPU and TLB are protected by the iothread lock.
2425 * We reload the dispatch pointer now because cpu_reloading_memory_map()
2426 * may have split the RCU critical section.
2428 d = atomic_rcu_read(&cpuas->as->dispatch);
2429 atomic_rcu_set(&cpuas->memory_dispatch, d);
2430 tlb_flush(cpuas->cpu, 1);
2433 void address_space_init_dispatch(AddressSpace *as)
2435 as->dispatch = NULL;
2436 as->dispatch_listener = (MemoryListener) {
2437 .begin = mem_begin,
2438 .commit = mem_commit,
2439 .region_add = mem_add,
2440 .region_nop = mem_add,
2441 .priority = 0,
2443 memory_listener_register(&as->dispatch_listener, as);
2446 void address_space_unregister(AddressSpace *as)
2448 memory_listener_unregister(&as->dispatch_listener);
2451 void address_space_destroy_dispatch(AddressSpace *as)
2453 AddressSpaceDispatch *d = as->dispatch;
2455 atomic_rcu_set(&as->dispatch, NULL);
2456 if (d) {
2457 call_rcu(d, address_space_dispatch_free, rcu);
2461 static void memory_map_init(void)
2463 system_memory = g_malloc(sizeof(*system_memory));
2465 memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2466 address_space_init(&address_space_memory, system_memory, "memory");
2468 system_io = g_malloc(sizeof(*system_io));
2469 memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2470 65536);
2471 address_space_init(&address_space_io, system_io, "I/O");
2474 MemoryRegion *get_system_memory(void)
2476 return system_memory;
2479 MemoryRegion *get_system_io(void)
2481 return system_io;
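/* Example (illustrative sketch): board code populates the "system" region
 * created above by nesting subregions into it.  A minimal RAM mapping at
 * guest address 0 might look like this; the region name is made up and
 * error handling is reduced to error_fatal.
 */
static void example_add_ram(MemoryRegion *ram, uint64_t size)
{
    memory_region_init_ram(ram, NULL, "example.ram", size, &error_fatal);
    memory_region_add_subregion(get_system_memory(), 0, ram);
}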
2484 #endif /* !defined(CONFIG_USER_ONLY) */
2486 /* physical memory access (slow version, mainly for debug) */
2487 #if defined(CONFIG_USER_ONLY)
2488 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2489 uint8_t *buf, int len, int is_write)
2491 int l, flags;
2492 target_ulong page;
2493 void * p;
2495 while (len > 0) {
2496 page = addr & TARGET_PAGE_MASK;
2497 l = (page + TARGET_PAGE_SIZE) - addr;
2498 if (l > len)
2499 l = len;
2500 flags = page_get_flags(page);
2501 if (!(flags & PAGE_VALID))
2502 return -1;
2503 if (is_write) {
2504 if (!(flags & PAGE_WRITE))
2505 return -1;
2506 /* XXX: this code should not depend on lock_user */
2507 if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2508 return -1;
2509 memcpy(p, buf, l);
2510 unlock_user(p, addr, l);
2511 } else {
2512 if (!(flags & PAGE_READ))
2513 return -1;
2514 /* XXX: this code should not depend on lock_user */
2515 if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2516 return -1;
2517 memcpy(buf, p, l);
2518 unlock_user(p, addr, 0);
2520 len -= l;
2521 buf += l;
2522 addr += l;
2524 return 0;
2527 #else
2529 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
2530 hwaddr length)
2532 uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2533 addr += memory_region_get_ram_addr(mr);
2535 /* No early return if dirty_log_mask is or becomes 0, because
2536 * cpu_physical_memory_set_dirty_range will still call
2537 * xen_modified_memory.
2539 if (dirty_log_mask) {
2540 dirty_log_mask =
2541 cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
2543 if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2544 tb_lock();
2545 tb_invalidate_phys_range(addr, addr + length);
2546 tb_unlock();
2547 dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2549 cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2552 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2554 unsigned access_size_max = mr->ops->valid.max_access_size;
2556 /* Regions are assumed to support 1-4 byte accesses unless
2557 otherwise specified. */
2558 if (access_size_max == 0) {
2559 access_size_max = 4;
2562 /* Bound the maximum access by the alignment of the address. */
2563 if (!mr->ops->impl.unaligned) {
2564 unsigned align_size_max = addr & -addr;
2565 if (align_size_max != 0 && align_size_max < access_size_max) {
2566 access_size_max = align_size_max;
2570 /* Don't attempt accesses larger than the maximum. */
2571 if (l > access_size_max) {
2572 l = access_size_max;
2574 l = pow2floor(l);
2576 return l;
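/* Worked example for the clipping above (values are illustrative): for a
 * region whose ops leave valid.max_access_size at 0 (so 4 is assumed), an
 * 8-byte request at addr 0x1006 is first capped by the address alignment
 * (0x1006 & -0x1006 == 2), then rounded down to a power of two, so the
 * access is split and the first chunk is 2 bytes wide.
 */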
2579 static bool prepare_mmio_access(MemoryRegion *mr)
2581 bool unlocked = !qemu_mutex_iothread_locked();
2582 bool release_lock = false;
2584 if (unlocked && mr->global_locking) {
2585 qemu_mutex_lock_iothread();
2586 unlocked = false;
2587 release_lock = true;
2589 if (mr->flush_coalesced_mmio) {
2590 if (unlocked) {
2591 qemu_mutex_lock_iothread();
2593 qemu_flush_coalesced_mmio_buffer();
2594 if (unlocked) {
2595 qemu_mutex_unlock_iothread();
2599 return release_lock;
2602 /* Called within RCU critical section. */
2603 static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr,
2604 MemTxAttrs attrs,
2605 const uint8_t *buf,
2606 int len, hwaddr addr1,
2607 hwaddr l, MemoryRegion *mr)
2609 uint8_t *ptr;
2610 uint64_t val;
2611 MemTxResult result = MEMTX_OK;
2612 bool release_lock = false;
2614 for (;;) {
2615 if (!memory_access_is_direct(mr, true)) {
2616 release_lock |= prepare_mmio_access(mr);
2617 l = memory_access_size(mr, l, addr1);
2618 /* XXX: could force current_cpu to NULL to avoid
2619 potential bugs */
2620 switch (l) {
2621 case 8:
2622 /* 64 bit write access */
2623 val = ldq_p(buf);
2624 result |= memory_region_dispatch_write(mr, addr1, val, 8,
2625 attrs);
2626 break;
2627 case 4:
2628 /* 32 bit write access */
2629 val = ldl_p(buf);
2630 result |= memory_region_dispatch_write(mr, addr1, val, 4,
2631 attrs);
2632 break;
2633 case 2:
2634 /* 16 bit write access */
2635 val = lduw_p(buf);
2636 result |= memory_region_dispatch_write(mr, addr1, val, 2,
2637 attrs);
2638 break;
2639 case 1:
2640 /* 8 bit write access */
2641 val = ldub_p(buf);
2642 result |= memory_region_dispatch_write(mr, addr1, val, 1,
2643 attrs);
2644 break;
2645 default:
2646 abort();
2648 } else {
2649 /* RAM case */
2650 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2651 memcpy(ptr, buf, l);
2652 invalidate_and_set_dirty(mr, addr1, l);
2655 if (release_lock) {
2656 qemu_mutex_unlock_iothread();
2657 release_lock = false;
2660 len -= l;
2661 buf += l;
2662 addr += l;
2664 if (!len) {
2665 break;
2668 l = len;
2669 mr = address_space_translate(as, addr, &addr1, &l, true);
2672 return result;
2675 MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
2676 const uint8_t *buf, int len)
2678 hwaddr l;
2679 hwaddr addr1;
2680 MemoryRegion *mr;
2681 MemTxResult result = MEMTX_OK;
2683 if (len > 0) {
2684 rcu_read_lock();
2685 l = len;
2686 mr = address_space_translate(as, addr, &addr1, &l, true);
2687 result = address_space_write_continue(as, addr, attrs, buf, len,
2688 addr1, l, mr);
2689 rcu_read_unlock();
2692 return result;
2695 /* Called within RCU critical section. */
2696 MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr,
2697 MemTxAttrs attrs, uint8_t *buf,
2698 int len, hwaddr addr1, hwaddr l,
2699 MemoryRegion *mr)
2701 uint8_t *ptr;
2702 uint64_t val;
2703 MemTxResult result = MEMTX_OK;
2704 bool release_lock = false;
2706 for (;;) {
2707 if (!memory_access_is_direct(mr, false)) {
2708 /* I/O case */
2709 release_lock |= prepare_mmio_access(mr);
2710 l = memory_access_size(mr, l, addr1);
2711 switch (l) {
2712 case 8:
2713 /* 64 bit read access */
2714 result |= memory_region_dispatch_read(mr, addr1, &val, 8,
2715 attrs);
2716 stq_p(buf, val);
2717 break;
2718 case 4:
2719 /* 32 bit read access */
2720 result |= memory_region_dispatch_read(mr, addr1, &val, 4,
2721 attrs);
2722 stl_p(buf, val);
2723 break;
2724 case 2:
2725 /* 16 bit read access */
2726 result |= memory_region_dispatch_read(mr, addr1, &val, 2,
2727 attrs);
2728 stw_p(buf, val);
2729 break;
2730 case 1:
2731 /* 8 bit read access */
2732 result |= memory_region_dispatch_read(mr, addr1, &val, 1,
2733 attrs);
2734 stb_p(buf, val);
2735 break;
2736 default:
2737 abort();
2739 } else {
2740 /* RAM case */
2741 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2742 memcpy(buf, ptr, l);
2745 if (release_lock) {
2746 qemu_mutex_unlock_iothread();
2747 release_lock = false;
2750 len -= l;
2751 buf += l;
2752 addr += l;
2754 if (!len) {
2755 break;
2758 l = len;
2759 mr = address_space_translate(as, addr, &addr1, &l, false);
2762 return result;
2765 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
2766 MemTxAttrs attrs, uint8_t *buf, int len)
2768 hwaddr l;
2769 hwaddr addr1;
2770 MemoryRegion *mr;
2771 MemTxResult result = MEMTX_OK;
2773 if (len > 0) {
2774 rcu_read_lock();
2775 l = len;
2776 mr = address_space_translate(as, addr, &addr1, &l, false);
2777 result = address_space_read_continue(as, addr, attrs, buf, len,
2778 addr1, l, mr);
2779 rcu_read_unlock();
2782 return result;
2785 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
2786 uint8_t *buf, int len, bool is_write)
2788 if (is_write) {
2789 return address_space_write(as, addr, attrs, (uint8_t *)buf, len);
2790 } else {
2791 return address_space_read(as, addr, attrs, (uint8_t *)buf, len);
2795 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
2796 int len, int is_write)
2798 address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
2799 buf, len, is_write);
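/* Example (illustrative sketch): device code that does not care about a
 * specific address space or transaction attributes typically funnels DMA
 * through the helper above; the wrapper and its arguments are made up.
 */
static void example_dma_copy_out(hwaddr guest_addr, const uint8_t *data,
                                 int len)
{
    /* Write 'len' bytes of 'data' into guest memory at 'guest_addr'. */
    cpu_physical_memory_rw(guest_addr, (uint8_t *)data, len, 1);
}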
2802 enum write_rom_type {
2803 WRITE_DATA,
2804 FLUSH_CACHE,
2807 static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2808 hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
2810 hwaddr l;
2811 uint8_t *ptr;
2812 hwaddr addr1;
2813 MemoryRegion *mr;
2815 rcu_read_lock();
2816 while (len > 0) {
2817 l = len;
2818 mr = address_space_translate(as, addr, &addr1, &l, true);
2820 if (!(memory_region_is_ram(mr) ||
2821 memory_region_is_romd(mr))) {
2822 l = memory_access_size(mr, l, addr1);
2823 } else {
2824 /* ROM/RAM case */
2825 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
2826 switch (type) {
2827 case WRITE_DATA:
2828 memcpy(ptr, buf, l);
2829 invalidate_and_set_dirty(mr, addr1, l);
2830 break;
2831 case FLUSH_CACHE:
2832 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
2833 break;
2836 len -= l;
2837 buf += l;
2838 addr += l;
2840 rcu_read_unlock();
2843 /* used for ROM loading: can write in RAM and ROM */
2844 void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2845 const uint8_t *buf, int len)
2847 cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
2850 void cpu_flush_icache_range(hwaddr start, int len)
2853 * This function should do the same thing as an icache flush that was
2854 * triggered from within the guest. For TCG we are always cache coherent,
2855 * so there is no need to flush anything. For KVM / Xen we need to flush
2856 * the host's instruction cache at least.
2858 if (tcg_enabled()) {
2859 return;
2862 cpu_physical_memory_write_rom_internal(&address_space_memory,
2863 start, NULL, len, FLUSH_CACHE);
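/* Example (illustrative sketch): a firmware loader writes its image with
 * the ROM-capable helper above and then makes sure the host icache is
 * coherent before the guest starts executing it.  The blob and its base
 * address are hypothetical.
 */
static void example_load_firmware(hwaddr rom_base, const uint8_t *blob,
                                  int blob_len)
{
    cpu_physical_memory_write_rom(&address_space_memory, rom_base,
                                  blob, blob_len);
    cpu_flush_icache_range(rom_base, blob_len);
}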
2866 typedef struct {
2867 MemoryRegion *mr;
2868 void *buffer;
2869 hwaddr addr;
2870 hwaddr len;
2871 bool in_use;
2872 } BounceBuffer;
2874 static BounceBuffer bounce;
2876 typedef struct MapClient {
2877 QEMUBH *bh;
2878 QLIST_ENTRY(MapClient) link;
2879 } MapClient;
2881 QemuMutex map_client_list_lock;
2882 static QLIST_HEAD(map_client_list, MapClient) map_client_list
2883 = QLIST_HEAD_INITIALIZER(map_client_list);
2885 static void cpu_unregister_map_client_do(MapClient *client)
2887 QLIST_REMOVE(client, link);
2888 g_free(client);
2891 static void cpu_notify_map_clients_locked(void)
2893 MapClient *client;
2895 while (!QLIST_EMPTY(&map_client_list)) {
2896 client = QLIST_FIRST(&map_client_list);
2897 qemu_bh_schedule(client->bh);
2898 cpu_unregister_map_client_do(client);
2902 void cpu_register_map_client(QEMUBH *bh)
2904 MapClient *client = g_malloc(sizeof(*client));
2906 qemu_mutex_lock(&map_client_list_lock);
2907 client->bh = bh;
2908 QLIST_INSERT_HEAD(&map_client_list, client, link);
2909 if (!atomic_read(&bounce.in_use)) {
2910 cpu_notify_map_clients_locked();
2912 qemu_mutex_unlock(&map_client_list_lock);
2915 void cpu_exec_init_all(void)
2917 qemu_mutex_init(&ram_list.mutex);
2918 /* The data structures we set up here depend on knowing the page size,
2919 * so no more changes can be made after this point.
2920 * In an ideal world, nothing we did before we had finished the
2921 * machine setup would care about the target page size, and we could
2922 * do this much later, rather than requiring board models to state
2923 * up front what their requirements are.
2925 finalize_target_page_bits();
2926 io_mem_init();
2927 memory_map_init();
2928 qemu_mutex_init(&map_client_list_lock);
2931 void cpu_unregister_map_client(QEMUBH *bh)
2933 MapClient *client;
2935 qemu_mutex_lock(&map_client_list_lock);
2936 QLIST_FOREACH(client, &map_client_list, link) {
2937 if (client->bh == bh) {
2938 cpu_unregister_map_client_do(client);
2939 break;
2942 qemu_mutex_unlock(&map_client_list_lock);
2945 static void cpu_notify_map_clients(void)
2947 qemu_mutex_lock(&map_client_list_lock);
2948 cpu_notify_map_clients_locked();
2949 qemu_mutex_unlock(&map_client_list_lock);
2952 bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
2954 MemoryRegion *mr;
2955 hwaddr l, xlat;
2957 rcu_read_lock();
2958 while (len > 0) {
2959 l = len;
2960 mr = address_space_translate(as, addr, &xlat, &l, is_write);
2961 if (!memory_access_is_direct(mr, is_write)) {
2962 l = memory_access_size(mr, l, addr);
2963 if (!memory_region_access_valid(mr, xlat, l, is_write)) {
2964 rcu_read_unlock();
2965 return false;
2968 len -= l;
2969 addr += l;
2971 rcu_read_unlock();
2972 return true;
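/* Example (illustrative sketch): callers that must not trigger a failing
 * transaction (e.g. when probing a guest-supplied address) can validate
 * the range first; the 512-byte length is arbitrary.
 */
static bool example_can_read_block(AddressSpace *as, hwaddr addr)
{
    return address_space_access_valid(as, addr, 512, false);
}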
2975 /* Map a physical memory region into a host virtual address.
2976 * May map a subset of the requested range, given by and returned in *plen.
2977 * May return NULL if resources needed to perform the mapping are exhausted.
2978 * Use only for reads OR writes - not for read-modify-write operations.
2979 * Use cpu_register_map_client() to know when retrying the map operation is
2980 * likely to succeed.
2982 void *address_space_map(AddressSpace *as,
2983 hwaddr addr,
2984 hwaddr *plen,
2985 bool is_write)
2987 hwaddr len = *plen;
2988 hwaddr done = 0;
2989 hwaddr l, xlat, base;
2990 MemoryRegion *mr, *this_mr;
2991 void *ptr;
2993 if (len == 0) {
2994 return NULL;
2997 l = len;
2998 rcu_read_lock();
2999 mr = address_space_translate(as, addr, &xlat, &l, is_write);
3001 if (!memory_access_is_direct(mr, is_write)) {
3002 if (atomic_xchg(&bounce.in_use, true)) {
3003 rcu_read_unlock();
3004 return NULL;
3006 /* Avoid unbounded allocations */
3007 l = MIN(l, TARGET_PAGE_SIZE);
3008 bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3009 bounce.addr = addr;
3010 bounce.len = l;
3012 memory_region_ref(mr);
3013 bounce.mr = mr;
3014 if (!is_write) {
3015 address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED,
3016 bounce.buffer, l);
3019 rcu_read_unlock();
3020 *plen = l;
3021 return bounce.buffer;
3024 base = xlat;
3026 for (;;) {
3027 len -= l;
3028 addr += l;
3029 done += l;
3030 if (len == 0) {
3031 break;
3034 l = len;
3035 this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
3036 if (this_mr != mr || xlat != base + done) {
3037 break;
3041 memory_region_ref(mr);
3042 *plen = done;
3043 ptr = qemu_ram_ptr_length(mr->ram_block, base, plen);
3044 rcu_read_unlock();
3046 return ptr;
3049 /* Unmaps a memory region previously mapped by address_space_map().
3050 * Will also mark the memory as dirty if is_write == 1. access_len gives
3051 * the amount of memory that was actually read or written by the caller.
3053 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3054 int is_write, hwaddr access_len)
3056 if (buffer != bounce.buffer) {
3057 MemoryRegion *mr;
3058 ram_addr_t addr1;
3060 mr = memory_region_from_host(buffer, &addr1);
3061 assert(mr != NULL);
3062 if (is_write) {
3063 invalidate_and_set_dirty(mr, addr1, access_len);
3065 if (xen_enabled()) {
3066 xen_invalidate_map_cache_entry(buffer);
3068 memory_region_unref(mr);
3069 return;
3071 if (is_write) {
3072 address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3073 bounce.buffer, access_len);
3075 qemu_vfree(bounce.buffer);
3076 bounce.buffer = NULL;
3077 memory_region_unref(bounce.mr);
3078 atomic_mb_set(&bounce.in_use, false);
3079 cpu_notify_map_clients();
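/* Example (illustrative sketch): the canonical map/modify/unmap pattern
 * used for zero-copy DMA.  When address_space_map() returns NULL the
 * caller is expected to retry later, typically after registering a map
 * client (cpu_register_map_client) so it is woken once the bounce buffer
 * frees up.  The memset payload is just a placeholder.
 */
static bool example_dma_fill(AddressSpace *as, hwaddr addr, hwaddr size)
{
    hwaddr len = size;
    void *host = address_space_map(as, addr, &len, true);

    if (!host) {
        return false;           /* resources exhausted, retry later */
    }
    memset(host, 0, len);       /* 'len' may be smaller than 'size' */
    address_space_unmap(as, host, len, true, len);
    return len == size;
}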
3082 void *cpu_physical_memory_map(hwaddr addr,
3083 hwaddr *plen,
3084 int is_write)
3086 return address_space_map(&address_space_memory, addr, plen, is_write);
3089 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3090 int is_write, hwaddr access_len)
3092 return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3095 /* warning: addr must be aligned */
3096 static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr,
3097 MemTxAttrs attrs,
3098 MemTxResult *result,
3099 enum device_endian endian)
3101 uint8_t *ptr;
3102 uint64_t val;
3103 MemoryRegion *mr;
3104 hwaddr l = 4;
3105 hwaddr addr1;
3106 MemTxResult r;
3107 bool release_lock = false;
3109 rcu_read_lock();
3110 mr = address_space_translate(as, addr, &addr1, &l, false);
3111 if (l < 4 || !memory_access_is_direct(mr, false)) {
3112 release_lock |= prepare_mmio_access(mr);
3114 /* I/O case */
3115 r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs);
3116 #if defined(TARGET_WORDS_BIGENDIAN)
3117 if (endian == DEVICE_LITTLE_ENDIAN) {
3118 val = bswap32(val);
3120 #else
3121 if (endian == DEVICE_BIG_ENDIAN) {
3122 val = bswap32(val);
3124 #endif
3125 } else {
3126 /* RAM case */
3127 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3128 switch (endian) {
3129 case DEVICE_LITTLE_ENDIAN:
3130 val = ldl_le_p(ptr);
3131 break;
3132 case DEVICE_BIG_ENDIAN:
3133 val = ldl_be_p(ptr);
3134 break;
3135 default:
3136 val = ldl_p(ptr);
3137 break;
3139 r = MEMTX_OK;
3141 if (result) {
3142 *result = r;
3144 if (release_lock) {
3145 qemu_mutex_unlock_iothread();
3147 rcu_read_unlock();
3148 return val;
3151 uint32_t address_space_ldl(AddressSpace *as, hwaddr addr,
3152 MemTxAttrs attrs, MemTxResult *result)
3154 return address_space_ldl_internal(as, addr, attrs, result,
3155 DEVICE_NATIVE_ENDIAN);
3158 uint32_t address_space_ldl_le(AddressSpace *as, hwaddr addr,
3159 MemTxAttrs attrs, MemTxResult *result)
3161 return address_space_ldl_internal(as, addr, attrs, result,
3162 DEVICE_LITTLE_ENDIAN);
3165 uint32_t address_space_ldl_be(AddressSpace *as, hwaddr addr,
3166 MemTxAttrs attrs, MemTxResult *result)
3168 return address_space_ldl_internal(as, addr, attrs, result,
3169 DEVICE_BIG_ENDIAN);
3172 uint32_t ldl_phys(AddressSpace *as, hwaddr addr)
3174 return address_space_ldl(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3177 uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr)
3179 return address_space_ldl_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3182 uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr)
3184 return address_space_ldl_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
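/* Example (illustrative sketch): the _phys wrappers above discard the
 * transaction result; a caller that needs to detect a failed read keeps
 * the attrs/result form instead.  The 0x10 register offset is made up.
 */
static uint32_t example_read_reg_checked(AddressSpace *as, hwaddr base)
{
    MemTxResult res;
    uint32_t val = address_space_ldl(as, base + 0x10,
                                     MEMTXATTRS_UNSPECIFIED, &res);

    return res == MEMTX_OK ? val : 0xffffffff;
}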
3187 /* warning: addr must be aligned */
3188 static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr,
3189 MemTxAttrs attrs,
3190 MemTxResult *result,
3191 enum device_endian endian)
3193 uint8_t *ptr;
3194 uint64_t val;
3195 MemoryRegion *mr;
3196 hwaddr l = 8;
3197 hwaddr addr1;
3198 MemTxResult r;
3199 bool release_lock = false;
3201 rcu_read_lock();
3202 mr = address_space_translate(as, addr, &addr1, &l,
3203 false);
3204 if (l < 8 || !memory_access_is_direct(mr, false)) {
3205 release_lock |= prepare_mmio_access(mr);
3207 /* I/O case */
3208 r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs);
3209 #if defined(TARGET_WORDS_BIGENDIAN)
3210 if (endian == DEVICE_LITTLE_ENDIAN) {
3211 val = bswap64(val);
3213 #else
3214 if (endian == DEVICE_BIG_ENDIAN) {
3215 val = bswap64(val);
3217 #endif
3218 } else {
3219 /* RAM case */
3220 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3221 switch (endian) {
3222 case DEVICE_LITTLE_ENDIAN:
3223 val = ldq_le_p(ptr);
3224 break;
3225 case DEVICE_BIG_ENDIAN:
3226 val = ldq_be_p(ptr);
3227 break;
3228 default:
3229 val = ldq_p(ptr);
3230 break;
3232 r = MEMTX_OK;
3234 if (result) {
3235 *result = r;
3237 if (release_lock) {
3238 qemu_mutex_unlock_iothread();
3240 rcu_read_unlock();
3241 return val;
3244 uint64_t address_space_ldq(AddressSpace *as, hwaddr addr,
3245 MemTxAttrs attrs, MemTxResult *result)
3247 return address_space_ldq_internal(as, addr, attrs, result,
3248 DEVICE_NATIVE_ENDIAN);
3251 uint64_t address_space_ldq_le(AddressSpace *as, hwaddr addr,
3252 MemTxAttrs attrs, MemTxResult *result)
3254 return address_space_ldq_internal(as, addr, attrs, result,
3255 DEVICE_LITTLE_ENDIAN);
3258 uint64_t address_space_ldq_be(AddressSpace *as, hwaddr addr,
3259 MemTxAttrs attrs, MemTxResult *result)
3261 return address_space_ldq_internal(as, addr, attrs, result,
3262 DEVICE_BIG_ENDIAN);
3265 uint64_t ldq_phys(AddressSpace *as, hwaddr addr)
3267 return address_space_ldq(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3270 uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr)
3272 return address_space_ldq_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3275 uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr)
3277 return address_space_ldq_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3280 /* XXX: optimize */
3281 uint32_t address_space_ldub(AddressSpace *as, hwaddr addr,
3282 MemTxAttrs attrs, MemTxResult *result)
3284 uint8_t val;
3285 MemTxResult r;
3287 r = address_space_rw(as, addr, attrs, &val, 1, 0);
3288 if (result) {
3289 *result = r;
3291 return val;
3294 uint32_t ldub_phys(AddressSpace *as, hwaddr addr)
3296 return address_space_ldub(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3299 /* warning: addr must be aligned */
3300 static inline uint32_t address_space_lduw_internal(AddressSpace *as,
3301 hwaddr addr,
3302 MemTxAttrs attrs,
3303 MemTxResult *result,
3304 enum device_endian endian)
3306 uint8_t *ptr;
3307 uint64_t val;
3308 MemoryRegion *mr;
3309 hwaddr l = 2;
3310 hwaddr addr1;
3311 MemTxResult r;
3312 bool release_lock = false;
3314 rcu_read_lock();
3315 mr = address_space_translate(as, addr, &addr1, &l,
3316 false);
3317 if (l < 2 || !memory_access_is_direct(mr, false)) {
3318 release_lock |= prepare_mmio_access(mr);
3320 /* I/O case */
3321 r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs);
3322 #if defined(TARGET_WORDS_BIGENDIAN)
3323 if (endian == DEVICE_LITTLE_ENDIAN) {
3324 val = bswap16(val);
3326 #else
3327 if (endian == DEVICE_BIG_ENDIAN) {
3328 val = bswap16(val);
3330 #endif
3331 } else {
3332 /* RAM case */
3333 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3334 switch (endian) {
3335 case DEVICE_LITTLE_ENDIAN:
3336 val = lduw_le_p(ptr);
3337 break;
3338 case DEVICE_BIG_ENDIAN:
3339 val = lduw_be_p(ptr);
3340 break;
3341 default:
3342 val = lduw_p(ptr);
3343 break;
3345 r = MEMTX_OK;
3347 if (result) {
3348 *result = r;
3350 if (release_lock) {
3351 qemu_mutex_unlock_iothread();
3353 rcu_read_unlock();
3354 return val;
3357 uint32_t address_space_lduw(AddressSpace *as, hwaddr addr,
3358 MemTxAttrs attrs, MemTxResult *result)
3360 return address_space_lduw_internal(as, addr, attrs, result,
3361 DEVICE_NATIVE_ENDIAN);
3364 uint32_t address_space_lduw_le(AddressSpace *as, hwaddr addr,
3365 MemTxAttrs attrs, MemTxResult *result)
3367 return address_space_lduw_internal(as, addr, attrs, result,
3368 DEVICE_LITTLE_ENDIAN);
3371 uint32_t address_space_lduw_be(AddressSpace *as, hwaddr addr,
3372 MemTxAttrs attrs, MemTxResult *result)
3374 return address_space_lduw_internal(as, addr, attrs, result,
3375 DEVICE_BIG_ENDIAN);
3378 uint32_t lduw_phys(AddressSpace *as, hwaddr addr)
3380 return address_space_lduw(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3383 uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr)
3385 return address_space_lduw_le(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3388 uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr)
3390 return address_space_lduw_be(as, addr, MEMTXATTRS_UNSPECIFIED, NULL);
3393 /* warning: addr must be aligned. The ram page is not flagged as
3394 containing modified code and the TBs inside it are not invalidated.
3395 This is useful if the dirty bits are used to track modified PTEs */
3396 void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val,
3397 MemTxAttrs attrs, MemTxResult *result)
3399 uint8_t *ptr;
3400 MemoryRegion *mr;
3401 hwaddr l = 4;
3402 hwaddr addr1;
3403 MemTxResult r;
3404 uint8_t dirty_log_mask;
3405 bool release_lock = false;
3407 rcu_read_lock();
3408 mr = address_space_translate(as, addr, &addr1, &l,
3409 true);
3410 if (l < 4 || !memory_access_is_direct(mr, true)) {
3411 release_lock |= prepare_mmio_access(mr);
3413 r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
3414 } else {
3415 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3416 stl_p(ptr, val);
3418 dirty_log_mask = memory_region_get_dirty_log_mask(mr);
3419 dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
3420 cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
3421 4, dirty_log_mask);
3422 r = MEMTX_OK;
3424 if (result) {
3425 *result = r;
3427 if (release_lock) {
3428 qemu_mutex_unlock_iothread();
3430 rcu_read_unlock();
3433 void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val)
3435 address_space_stl_notdirty(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
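/* Example (illustrative sketch): the typical user of the notdirty store
 * above is target MMU code updating accessed/dirty bits in a guest page
 * table entry; a plain stl_phys() would also invalidate any translated
 * code on that page.  The 0x20 accessed bit is a hypothetical layout.
 */
static void example_set_pte_accessed(AddressSpace *as, hwaddr pte_addr)
{
    uint32_t pte = ldl_phys(as, pte_addr);

    stl_phys_notdirty(as, pte_addr, pte | 0x20 /* hypothetical accessed bit */);
}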
3438 /* warning: addr must be aligned */
3439 static inline void address_space_stl_internal(AddressSpace *as,
3440 hwaddr addr, uint32_t val,
3441 MemTxAttrs attrs,
3442 MemTxResult *result,
3443 enum device_endian endian)
3445 uint8_t *ptr;
3446 MemoryRegion *mr;
3447 hwaddr l = 4;
3448 hwaddr addr1;
3449 MemTxResult r;
3450 bool release_lock = false;
3452 rcu_read_lock();
3453 mr = address_space_translate(as, addr, &addr1, &l,
3454 true);
3455 if (l < 4 || !memory_access_is_direct(mr, true)) {
3456 release_lock |= prepare_mmio_access(mr);
3458 #if defined(TARGET_WORDS_BIGENDIAN)
3459 if (endian == DEVICE_LITTLE_ENDIAN) {
3460 val = bswap32(val);
3462 #else
3463 if (endian == DEVICE_BIG_ENDIAN) {
3464 val = bswap32(val);
3466 #endif
3467 r = memory_region_dispatch_write(mr, addr1, val, 4, attrs);
3468 } else {
3469 /* RAM case */
3470 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3471 switch (endian) {
3472 case DEVICE_LITTLE_ENDIAN:
3473 stl_le_p(ptr, val);
3474 break;
3475 case DEVICE_BIG_ENDIAN:
3476 stl_be_p(ptr, val);
3477 break;
3478 default:
3479 stl_p(ptr, val);
3480 break;
3482 invalidate_and_set_dirty(mr, addr1, 4);
3483 r = MEMTX_OK;
3485 if (result) {
3486 *result = r;
3488 if (release_lock) {
3489 qemu_mutex_unlock_iothread();
3491 rcu_read_unlock();
3494 void address_space_stl(AddressSpace *as, hwaddr addr, uint32_t val,
3495 MemTxAttrs attrs, MemTxResult *result)
3497 address_space_stl_internal(as, addr, val, attrs, result,
3498 DEVICE_NATIVE_ENDIAN);
3501 void address_space_stl_le(AddressSpace *as, hwaddr addr, uint32_t val,
3502 MemTxAttrs attrs, MemTxResult *result)
3504 address_space_stl_internal(as, addr, val, attrs, result,
3505 DEVICE_LITTLE_ENDIAN);
3508 void address_space_stl_be(AddressSpace *as, hwaddr addr, uint32_t val,
3509 MemTxAttrs attrs, MemTxResult *result)
3511 address_space_stl_internal(as, addr, val, attrs, result,
3512 DEVICE_BIG_ENDIAN);
3515 void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3517 address_space_stl(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3520 void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3522 address_space_stl_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3525 void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3527 address_space_stl_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3530 /* XXX: optimize */
3531 void address_space_stb(AddressSpace *as, hwaddr addr, uint32_t val,
3532 MemTxAttrs attrs, MemTxResult *result)
3534 uint8_t v = val;
3535 MemTxResult r;
3537 r = address_space_rw(as, addr, attrs, &v, 1, 1);
3538 if (result) {
3539 *result = r;
3543 void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3545 address_space_stb(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3548 /* warning: addr must be aligned */
3549 static inline void address_space_stw_internal(AddressSpace *as,
3550 hwaddr addr, uint32_t val,
3551 MemTxAttrs attrs,
3552 MemTxResult *result,
3553 enum device_endian endian)
3555 uint8_t *ptr;
3556 MemoryRegion *mr;
3557 hwaddr l = 2;
3558 hwaddr addr1;
3559 MemTxResult r;
3560 bool release_lock = false;
3562 rcu_read_lock();
3563 mr = address_space_translate(as, addr, &addr1, &l, true);
3564 if (l < 2 || !memory_access_is_direct(mr, true)) {
3565 release_lock |= prepare_mmio_access(mr);
3567 #if defined(TARGET_WORDS_BIGENDIAN)
3568 if (endian == DEVICE_LITTLE_ENDIAN) {
3569 val = bswap16(val);
3571 #else
3572 if (endian == DEVICE_BIG_ENDIAN) {
3573 val = bswap16(val);
3575 #endif
3576 r = memory_region_dispatch_write(mr, addr1, val, 2, attrs);
3577 } else {
3578 /* RAM case */
3579 ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3580 switch (endian) {
3581 case DEVICE_LITTLE_ENDIAN:
3582 stw_le_p(ptr, val);
3583 break;
3584 case DEVICE_BIG_ENDIAN:
3585 stw_be_p(ptr, val);
3586 break;
3587 default:
3588 stw_p(ptr, val);
3589 break;
3591 invalidate_and_set_dirty(mr, addr1, 2);
3592 r = MEMTX_OK;
3594 if (result) {
3595 *result = r;
3597 if (release_lock) {
3598 qemu_mutex_unlock_iothread();
3600 rcu_read_unlock();
3603 void address_space_stw(AddressSpace *as, hwaddr addr, uint32_t val,
3604 MemTxAttrs attrs, MemTxResult *result)
3606 address_space_stw_internal(as, addr, val, attrs, result,
3607 DEVICE_NATIVE_ENDIAN);
3610 void address_space_stw_le(AddressSpace *as, hwaddr addr, uint32_t val,
3611 MemTxAttrs attrs, MemTxResult *result)
3613 address_space_stw_internal(as, addr, val, attrs, result,
3614 DEVICE_LITTLE_ENDIAN);
3617 void address_space_stw_be(AddressSpace *as, hwaddr addr, uint32_t val,
3618 MemTxAttrs attrs, MemTxResult *result)
3620 address_space_stw_internal(as, addr, val, attrs, result,
3621 DEVICE_BIG_ENDIAN);
3624 void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3626 address_space_stw(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3629 void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3631 address_space_stw_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3634 void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
3636 address_space_stw_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3639 /* XXX: optimize */
3640 void address_space_stq(AddressSpace *as, hwaddr addr, uint64_t val,
3641 MemTxAttrs attrs, MemTxResult *result)
3643 MemTxResult r;
3644 val = tswap64(val);
3645 r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3646 if (result) {
3647 *result = r;
3651 void address_space_stq_le(AddressSpace *as, hwaddr addr, uint64_t val,
3652 MemTxAttrs attrs, MemTxResult *result)
3654 MemTxResult r;
3655 val = cpu_to_le64(val);
3656 r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3657 if (result) {
3658 *result = r;
3661 void address_space_stq_be(AddressSpace *as, hwaddr addr, uint64_t val,
3662 MemTxAttrs attrs, MemTxResult *result)
3664 MemTxResult r;
3665 val = cpu_to_be64(val);
3666 r = address_space_rw(as, addr, attrs, (void *) &val, 8, 1);
3667 if (result) {
3668 *result = r;
3672 void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3674 address_space_stq(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3677 void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3679 address_space_stq_le(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3682 void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3684 address_space_stq_be(as, addr, val, MEMTXATTRS_UNSPECIFIED, NULL);
3687 /* virtual memory access for debug (includes writing to ROM) */
3688 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3689 uint8_t *buf, int len, int is_write)
3691 int l;
3692 hwaddr phys_addr;
3693 target_ulong page;
3695 while (len > 0) {
3696 int asidx;
3697 MemTxAttrs attrs;
3699 page = addr & TARGET_PAGE_MASK;
3700 phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3701 asidx = cpu_asidx_from_attrs(cpu, attrs);
3702 /* if no physical page mapped, return an error */
3703 if (phys_addr == -1)
3704 return -1;
3705 l = (page + TARGET_PAGE_SIZE) - addr;
3706 if (l > len)
3707 l = len;
3708 phys_addr += (addr & ~TARGET_PAGE_MASK);
3709 if (is_write) {
3710 cpu_physical_memory_write_rom(cpu->cpu_ases[asidx].as,
3711 phys_addr, buf, l);
3712 } else {
3713 address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3714 MEMTXATTRS_UNSPECIFIED,
3715 buf, l, 0);
3717 len -= l;
3718 buf += l;
3719 addr += l;
3721 return 0;
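/* Example (illustrative sketch): a debugger stub reads guest virtual
 * memory through the helper above; the fixed 64-byte window is only for
 * illustration.
 */
static int example_debug_peek(CPUState *cpu, target_ulong vaddr,
                              uint8_t *out)
{
    return cpu_memory_rw_debug(cpu, vaddr, out, 64, 0);
}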
3725 * Allows code that needs to deal with migration bitmaps etc. to still be
3726 * built target-independent.
3728 size_t qemu_target_page_bits(void)
3730 return TARGET_PAGE_BITS;
3733 #endif
3736 * A helper function for the _utterly broken_ virtio device model to find out if
3737 * it's running on a big endian machine. Don't do this at home kids!
3739 bool target_words_bigendian(void);
3740 bool target_words_bigendian(void)
3742 #if defined(TARGET_WORDS_BIGENDIAN)
3743 return true;
3744 #else
3745 return false;
3746 #endif
3749 #ifndef CONFIG_USER_ONLY
3750 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3752 MemoryRegion *mr;
3753 hwaddr l = 1;
3754 bool res;
3756 rcu_read_lock();
3757 mr = address_space_translate(&address_space_memory,
3758 phys_addr, &phys_addr, &l, false);
3760 res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3761 rcu_read_unlock();
3762 return res;
3765 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3767 RAMBlock *block;
3768 int ret = 0;
3770 rcu_read_lock();
3771 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3772 ret = func(block->idstr, block->host, block->offset,
3773 block->used_length, opaque);
3774 if (ret) {
3775 break;
3778 rcu_read_unlock();
3779 return ret;
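/* Example (illustrative sketch): a migration-style walker visits every
 * RAMBlock through the iterator above.  The callback parameters mirror the
 * call made in qemu_ram_foreach_block(); this one just sums block sizes.
 *
 * Usage: uint64_t total = 0;
 *        qemu_ram_foreach_block(example_count_ram, &total);
 */
static int example_count_ram(const char *idstr, void *host_addr,
                             ram_addr_t offset, ram_addr_t length,
                             void *opaque)
{
    *(uint64_t *)opaque += length;
    return 0;               /* keep iterating */
}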
3781 #endif