exec.c

   1 /*
   2  *  Virtual page mapping
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19 #include "config.h"
  20 #ifndef _WIN32
  21 #include <sys/types.h>
  22 #include <sys/mman.h>
  23 #endif
  24
  25 #include "qemu-common.h"
  26 #include "cpu.h"
  27 #include "tcg.h"
  28 #include "hw/hw.h"
  29 #include "hw/qdev.h"
  30 #include "qemu/osdep.h"
  31 #include "sysemu/kvm.h"
  32 #include "sysemu/sysemu.h"
  33 #include "hw/xen/xen.h"
  34 #include "qemu/timer.h"
  35 #include "qemu/config-file.h"
  36 #include "qemu/error-report.h"
  37 #include "exec/memory.h"
  38 #include "sysemu/dma.h"
  39 #include "exec/address-spaces.h"
  40 #if defined(CONFIG_USER_ONLY)
  41 #include <qemu.h>
  42 #else /* !CONFIG_USER_ONLY */
  43 #include "sysemu/xen-mapcache.h"
  44 #include "trace.h"
  45 #endif
  46 #include "exec/cpu-all.h"
  47 #include "qemu/rcu_queue.h"
  48 #include "exec/cputlb.h"
  49 #include "translate-all.h"
  50
  51 #include "exec/memory-internal.h"
  52 #include "exec/ram_addr.h"
  53
  54 #include "qemu/range.h"
  55
  56 //#define DEBUG_SUBPAGE
  57
  58 #if !defined(CONFIG_USER_ONLY)
/* Dirty-tracking switch; written only by
 * cpu_physical_memory_set_dirty_tracking() in this part of the file.
 */
static bool in_migration;

/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

/* NOTE(review): presumably the root regions behind address_space_memory and
 * address_space_io; they are initialised outside this excerpt - confirm.
 */
static MemoryRegion *system_memory;
static MemoryRegion *system_io;

/* address_space_memory is the address space cpu_exec_init() assigns to
 * every new CPU in system emulation.
 */
AddressSpace address_space_io;
AddressSpace address_space_memory;

/* Special regions compared by address: memory_region_is_unassigned() tests
 * against io_mem_rom/io_mem_notdirty, and address_space_translate() returns
 * io_mem_unassigned when an IOMMU entry lacks the needed permission.
 */
MemoryRegion io_mem_rom, io_mem_notdirty;
static MemoryRegion io_mem_unassigned;

/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
#define RAM_PREALLOC   (1 << 0)

/* RAM is mmap-ed with MAP_SHARED */
#define RAM_SHARED     (1 << 1)

/* Only a portion of RAM (used_length) is actually used, and migrated.
 * This used_length size can change across reboots.
 */
#define RAM_RESIZEABLE (1 << 2)
  85
  86 #endif
  87
/* Queue of all CPUs; cpu_exec_init() appends each new CPU to its tail. */
struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
DEFINE_TLS(CPUState *, current_cpu);
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;
  96
  97 #if !defined(CONFIG_USER_ONLY)
  98
  99 typedef struct PhysPageEntry PhysPageEntry;
 100
/* One slot of the physical page radix tree, packed into 32 bits: either a
 * leaf naming a section or an interior link naming a child node.
 */
struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf.
     * phys_page_find() subtracts this value from its remaining-level counter
     * each time it follows the entry.
     */
    uint32_t skip : 6;
     /* index into phys_sections (!skip) or phys_map_nodes (skip);
      * PHYS_MAP_NODE_NIL marks a link with no child node allocated yet.
      */
    uint32_t ptr : 26;
};
 107
/* All-ones value of the 26-bit PhysPageEntry.ptr field (32 - 6 bits);
 * used as the "no node" marker.
 */
#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

/* Size of the L2 (and L3, etc) page tables.  */
#define ADDR_SPACE_BITS 64

/* Each tree node resolves P_L2_BITS bits of the page index and therefore
 * holds P_L2_SIZE entries.
 */
#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

/* Number of levels needed to cover the page-index bits
 * (ADDR_SPACE_BITS - TARGET_PAGE_BITS), rounded up.
 */
#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

/* A tree node: a fixed-size table of P_L2_SIZE entries. */
typedef PhysPageEntry Node[P_L2_SIZE];
 119
/* Backing storage for one radix tree: a growable array of nodes and a
 * growable array of sections.
 */
typedef struct PhysPageMap {
    /* NOTE(review): presumably used for RCU-deferred reclamation; the code
     * that uses it is outside this excerpt.
     */
    struct rcu_head rcu;

    /* Entries in use / entries allocated in @sections
     * (maintained by phys_section_add()).
     */
    unsigned sections_nb;
    unsigned sections_nb_alloc;
    /* Entries in use / entries allocated in @nodes
     * (maintained by phys_map_node_alloc() and phys_map_node_reserve()).
     */
    unsigned nodes_nb;
    unsigned nodes_nb_alloc;
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;
 130
/* Lookup structure for one address space: the root entry of the radix tree
 * plus the node/section storage it indexes.  Readers in this file obtain it
 * with atomic_rcu_read(&as->dispatch) inside an RCU read-side section.
 */
struct AddressSpaceDispatch {
    struct rcu_head rcu;

    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    PhysPageMap map;
    /* Address space handed to subpage_init() and stored in sections created
     * by register_subpage().
     */
    AddressSpace *as;
};
 141
/* Offset of @addr within its target page. */
#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
/* Container used when one target page is shared by several sections.
 * address_space_lookup_region() recovers it from @iomem via container_of()
 * and uses sub_section[SUBPAGE_IDX(addr)] as an index into the dispatch's
 * section array.
 */
typedef struct subpage_t {
    MemoryRegion iomem;
    AddressSpace *as;
    hwaddr base;
    /* One section index per byte offset within the page. */
    uint16_t sub_section[TARGET_PAGE_SIZE];
} subpage_t;
 149
/* Well-known section indices.  phys_page_find() returns
 * sections[PHYS_SECTION_UNASSIGNED] for unmapped addresses, and
 * memory_region_section_get_iotlb() combines NOTDIRTY/ROM/WATCH with a
 * page-aligned address to build iotlb values.
 */
#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2
#define PHYS_SECTION_WATCH 3
 154
 155 static void io_mem_init(void);
 156 static void memory_map_init(void);
 157 static void tcg_commit(MemoryListener *listener);
 158
 159 static MemoryRegion io_mem_watch;
 160 #endif
 161
 162 #if !defined(CONFIG_USER_ONLY)
 163
 164 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 165 {
 166     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 167         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc * 2, 16);
 168         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 169         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 170     }
 171 }
 172
 173 static uint32_t phys_map_node_alloc(PhysPageMap *map)
 174 {
 175     unsigned i;
 176     uint32_t ret;
 177
 178     ret = map->nodes_nb++;
 179     assert(ret != PHYS_MAP_NODE_NIL);
 180     assert(ret != map->nodes_nb_alloc);
 181     for (i = 0; i < P_L2_SIZE; ++i) {
 182         map->nodes[ret][i].skip = 1;
 183         map->nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
 184     }
 185     return ret;
 186 }
 187
 188 static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 189                                 hwaddr *index, hwaddr *nb, uint16_t leaf,
 190                                 int level)
 191 {
 192     PhysPageEntry *p;
 193     int i;
 194     hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 195
 196     if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 197         lp->ptr = phys_map_node_alloc(map);
 198         p = map->nodes[lp->ptr];
 199         if (level == 0) {
 200             for (i = 0; i < P_L2_SIZE; i++) {
 201                 p[i].skip = 0;
 202                 p[i].ptr = PHYS_SECTION_UNASSIGNED;
 203             }
 204         }
 205     } else {
 206         p = map->nodes[lp->ptr];
 207     }
 208     lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 209
 210     while (*nb && lp < &p[P_L2_SIZE]) {
 211         if ((*index & (step - 1)) == 0 && *nb >= step) {
 212             lp->skip = 0;
 213             lp->ptr = leaf;
 214             *index += step;
 215             *nb -= step;
 216         } else {
 217             phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 218         }
 219         ++lp;
 220     }
 221 }
 222
 223 static void phys_page_set(AddressSpaceDispatch *d,
 224                           hwaddr index, hwaddr nb,
 225                           uint16_t leaf)
 226 {
 227     /* Wildly overreserve - it doesn't matter much. */
 228     phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 229
 230     phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 231 }
 232
 233 /* Compact a non leaf page entry. Simply detect that the entry has a single child,
 234  * and update our entry so we can skip it and go directly to the destination.
 235  */
 236 static void phys_page_compact(PhysPageEntry *lp, Node *nodes, unsigned long *compacted)
 237 {
 238     unsigned valid_ptr = P_L2_SIZE;
 239     int valid = 0;
 240     PhysPageEntry *p;
 241     int i;
 242
 243     if (lp->ptr == PHYS_MAP_NODE_NIL) {
 244         return;
 245     }
 246
 247     p = nodes[lp->ptr];
 248     for (i = 0; i < P_L2_SIZE; i++) {
 249         if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 250             continue;
 251         }
 252
 253         valid_ptr = i;
 254         valid++;
 255         if (p[i].skip) {
 256             phys_page_compact(&p[i], nodes, compacted);
 257         }
 258     }
 259
 260     /* We can only compress if there's only one child. */
 261     if (valid != 1) {
 262         return;
 263     }
 264
 265     assert(valid_ptr < P_L2_SIZE);
 266
 267     /* Don't compress if it won't fit in the # of bits we have. */
 268     if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
 269         return;
 270     }
 271
 272     lp->ptr = p[valid_ptr].ptr;
 273     if (!p[valid_ptr].skip) {
 274         /* If our only child is a leaf, make this a leaf. */
 275         /* By design, we should have made this node a leaf to begin with so we
 276          * should never reach here.
 277          * But since it's so simple to handle this, let's do it just in case we
 278          * change this rule.
 279          */
 280         lp->skip = 0;
 281     } else {
 282         lp->skip += p[valid_ptr].skip;
 283     }
 284 }
 285
 286 static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
 287 {
 288     DECLARE_BITMAP(compacted, nodes_nb);
 289
 290     if (d->phys_map.skip) {
 291         phys_page_compact(&d->phys_map, d->map.nodes, compacted);
 292     }
 293 }
 294
 295 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
 296                                            Node *nodes, MemoryRegionSection *sections)
 297 {
 298     PhysPageEntry *p;
 299     hwaddr index = addr >> TARGET_PAGE_BITS;
 300     int i;
 301
 302     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 303         if (lp.ptr == PHYS_MAP_NODE_NIL) {
 304             return &sections[PHYS_SECTION_UNASSIGNED];
 305         }
 306         p = nodes[lp.ptr];
 307         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 308     }
 309
 310     if (sections[lp.ptr].size.hi ||
 311         range_covers_byte(sections[lp.ptr].offset_within_address_space,
 312                           sections[lp.ptr].size.lo, addr)) {
 313         return &sections[lp.ptr];
 314     } else {
 315         return &sections[PHYS_SECTION_UNASSIGNED];
 316     }
 317 }
 318
 319 bool memory_region_is_unassigned(MemoryRegion *mr)
 320 {
 321     return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
 322         && mr != &io_mem_watch;
 323 }
 324
 325 /* Called from RCU critical section */
 326 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 327                                                         hwaddr addr,
 328                                                         bool resolve_subpage)
 329 {
 330     MemoryRegionSection *section;
 331     subpage_t *subpage;
 332
 333     section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
 334     if (resolve_subpage && section->mr->subpage) {
 335         subpage = container_of(section->mr, subpage_t, iomem);
 336         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 337     }
 338     return section;
 339 }
 340
 341 /* Called from RCU critical section */
 342 static MemoryRegionSection *
 343 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 344                                  hwaddr *plen, bool resolve_subpage)
 345 {
 346     MemoryRegionSection *section;
 347     Int128 diff;
 348
 349     section = address_space_lookup_region(d, addr, resolve_subpage);
 350     /* Compute offset within MemoryRegionSection */
 351     addr -= section->offset_within_address_space;
 352
 353     /* Compute offset within MemoryRegion */
 354     *xlat = addr + section->offset_within_region;
 355
 356     diff = int128_sub(section->mr->size, int128_make64(addr));
 357     *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 358     return section;
 359 }
 360
 361 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 362 {
 363     if (memory_region_is_ram(mr)) {
 364         return !(is_write && mr->readonly);
 365     }
 366     if (memory_region_is_romd(mr)) {
 367         return !is_write;
 368     }
 369
 370     return false;
 371 }
 372
 373 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
 374                                       hwaddr *xlat, hwaddr *plen,
 375                                       bool is_write)
 376 {
 377     IOMMUTLBEntry iotlb;
 378     MemoryRegionSection *section;
 379     MemoryRegion *mr;
 380     hwaddr len = *plen;
 381
 382     rcu_read_lock();
 383     for (;;) {
 384         AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
 385         section = address_space_translate_internal(d, addr, &addr, plen, true);
 386         mr = section->mr;
 387
 388         if (!mr->iommu_ops) {
 389             break;
 390         }
 391
 392         iotlb = mr->iommu_ops->translate(mr, addr, is_write);
 393         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 394                 | (addr & iotlb.addr_mask));
 395         len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
 396         if (!(iotlb.perm & (1 << is_write))) {
 397             mr = &io_mem_unassigned;
 398             break;
 399         }
 400
 401         as = iotlb.target_as;
 402     }
 403
 404     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 405         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 406         len = MIN(page, len);
 407     }
 408
 409     *plen = len;
 410     *xlat = addr;
 411     rcu_read_unlock();
 412     return mr;
 413 }
 414
 415 /* Called from RCU critical section */
 416 MemoryRegionSection *
 417 address_space_translate_for_iotlb(CPUState *cpu, hwaddr addr,
 418                                   hwaddr *xlat, hwaddr *plen)
 419 {
 420     MemoryRegionSection *section;
 421     section = address_space_translate_internal(cpu->memory_dispatch,
 422                                                addr, xlat, plen, false);
 423
 424     assert(!section->mr->iommu_ops);
 425     return section;
 426 }
 427 #endif
 428
 429 void cpu_exec_init_all(void)
 430 {
 431 #if !defined(CONFIG_USER_ONLY)
 432     qemu_mutex_init(&ram_list.mutex);
 433     memory_map_init();
 434     io_mem_init();
 435 #endif
 436 }
 437
 438 #if !defined(CONFIG_USER_ONLY)
 439
 440 static int cpu_common_post_load(void *opaque, int version_id)
 441 {
 442     CPUState *cpu = opaque;
 443
 444     /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 445        version_id is increased. */
 446     cpu->interrupt_request &= ~0x01;
 447     tlb_flush(cpu, 1);
 448
 449     return 0;
 450 }
 451
 452 static int cpu_common_pre_load(void *opaque)
 453 {
 454     CPUState *cpu = opaque;
 455
 456     cpu->exception_index = -1;
 457
 458     return 0;
 459 }
 460
 461 static bool cpu_common_exception_index_needed(void *opaque)
 462 {
 463     CPUState *cpu = opaque;
 464
 465     return tcg_enabled() && cpu->exception_index != -1;
 466 }
 467
/* Optional subsection of vmstate_cpu_common carrying
 * CPUState::exception_index as a 32-bit signed value.  It is attached with
 * cpu_common_exception_index_needed() as its "needed" predicate.
 */
static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};
 477
/* State description for the target-independent part of a CPU: the halted
 * and interrupt_request fields, plus the optional exception_index
 * subsection.  cpu_exec_init() registers it for CPUs whose device has no
 * vmsd of its own.
 */
const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = cpu_common_pre_load,
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (VMStateSubsection[]) {
        {
            .vmsd = &vmstate_cpu_common_exception_index,
            .needed = cpu_common_exception_index_needed,
        } , {
            /* empty */
        }
    }
};
 498
 499 #endif
 500
 501 CPUState *qemu_get_cpu(int index)
 502 {
 503     CPUState *cpu;
 504
 505     CPU_FOREACH(cpu) {
 506         if (cpu->cpu_index == index) {
 507             return cpu;
 508         }
 509     }
 510
 511     return NULL;
 512 }
 513
 514 #if !defined(CONFIG_USER_ONLY)
 515 void tcg_cpu_address_space_init(CPUState *cpu, AddressSpace *as)
 516 {
 517     /* We only support one address space per cpu at the moment.  */
 518     assert(cpu->as == as);
 519
 520     if (cpu->tcg_as_listener) {
 521         memory_listener_unregister(cpu->tcg_as_listener);
 522     } else {
 523         cpu->tcg_as_listener = g_new0(MemoryListener, 1);
 524     }
 525     cpu->tcg_as_listener->commit = tcg_commit;
 526     memory_listener_register(cpu->tcg_as_listener, as);
 527 }
 528 #endif
 529
 530 void cpu_exec_init(CPUArchState *env)
 531 {
 532     CPUState *cpu = ENV_GET_CPU(env);
 533     CPUClass *cc = CPU_GET_CLASS(cpu);
 534     CPUState *some_cpu;
 535     int cpu_index;
 536
 537 #if defined(CONFIG_USER_ONLY)
 538     cpu_list_lock();
 539 #endif
 540     cpu_index = 0;
 541     CPU_FOREACH(some_cpu) {
 542         cpu_index++;
 543     }
 544     cpu->cpu_index = cpu_index;
 545     cpu->numa_node = 0;
 546     QTAILQ_INIT(&cpu->breakpoints);
 547     QTAILQ_INIT(&cpu->watchpoints);
 548 #ifndef CONFIG_USER_ONLY
 549     cpu->as = &address_space_memory;
 550     cpu->thread_id = qemu_get_thread_id();
 551     cpu_reload_memory_map(cpu);
 552 #endif
 553     QTAILQ_INSERT_TAIL(&cpus, cpu, node);
 554 #if defined(CONFIG_USER_ONLY)
 555     cpu_list_unlock();
 556 #endif
 557     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 558         vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
 559     }
 560 #if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY)
 561     register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION,
 562                     cpu_save, cpu_load, env);
 563     assert(cc->vmsd == NULL);
 564     assert(qdev_get_vmsd(DEVICE(cpu)) == NULL);
 565 #endif
 566     if (cc->vmsd != NULL) {
 567         vmstate_register(NULL, cpu_index, cc->vmsd, cpu);
 568     }
 569 }
 570
 571 #if defined(CONFIG_USER_ONLY)
 572 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 573 {
 574     tb_invalidate_phys_page_range(pc, pc + 1, 0);
 575 }
 576 #else
 577 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 578 {
 579     hwaddr phys = cpu_get_phys_page_debug(cpu, pc);
 580     if (phys != -1) {
 581         tb_invalidate_phys_addr(cpu->as,
 582                                 phys | (pc & ~TARGET_PAGE_MASK));
 583     }
 584 }
 585 #endif
 586
 587 #if defined(CONFIG_USER_ONLY)
/* User-only stub: this build variant implements no watchpoints, so there
 * is nothing to remove.
 */
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)

{
}
 592
/* User-only stub: always fails with -ENOSYS. */
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    return -ENOSYS;
}
 598
/* User-only stub: does nothing. */
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
}
 602
/* User-only stub: always fails with -ENOSYS; *@watchpoint is not written. */
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    return -ENOSYS;
}
 608 #else
 609 /* Add a watchpoint.  */
 610 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 611                           int flags, CPUWatchpoint **watchpoint)
 612 {
 613     CPUWatchpoint *wp;
 614
 615     /* forbid ranges which are empty or run off the end of the address space */
 616     if (len == 0 || (addr + len - 1) < addr) {
 617         error_report("tried to set invalid watchpoint at %"
 618                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
 619         return -EINVAL;
 620     }
 621     wp = g_malloc(sizeof(*wp));
 622
 623     wp->vaddr = addr;
 624     wp->len = len;
 625     wp->flags = flags;
 626
 627     /* keep all GDB-injected watchpoints in front */
 628     if (flags & BP_GDB) {
 629         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
 630     } else {
 631         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
 632     }
 633
 634     tlb_flush_page(cpu, addr);
 635
 636     if (watchpoint)
 637         *watchpoint = wp;
 638     return 0;
 639 }
 640
 641 /* Remove a specific watchpoint.  */
 642 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 643                           int flags)
 644 {
 645     CPUWatchpoint *wp;
 646
 647     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 648         if (addr == wp->vaddr && len == wp->len
 649                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 650             cpu_watchpoint_remove_by_ref(cpu, wp);
 651             return 0;
 652         }
 653     }
 654     return -ENOENT;
 655 }
 656
 657 /* Remove a specific watchpoint by reference.  */
 658 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 659 {
 660     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 661
 662     tlb_flush_page(cpu, watchpoint->vaddr);
 663
 664     g_free(watchpoint);
 665 }
 666
 667 /* Remove all matching watchpoints.  */
 668 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 669 {
 670     CPUWatchpoint *wp, *next;
 671
 672     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
 673         if (wp->flags & mask) {
 674             cpu_watchpoint_remove_by_ref(cpu, wp);
 675         }
 676     }
 677 }
 678
 679 /* Return true if this watchpoint address matches the specified
 680  * access (ie the address range covered by the watchpoint overlaps
 681  * partially or completely with the address range covered by the
 682  * access).
 683  */
 684 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
 685                                                   vaddr addr,
 686                                                   vaddr len)
 687 {
 688     /* We know the lengths are non-zero, but a little caution is
 689      * required to avoid errors in the case where the range ends
 690      * exactly at the top of the address space and so addr + len
 691      * wraps round to zero.
 692      */
 693     vaddr wpend = wp->vaddr + wp->len - 1;
 694     vaddr addrend = addr + len - 1;
 695
 696     return !(addr > wpend || wp->vaddr > addrend);
 697 }
 698
 699 #endif
 700
 701 /* Add a breakpoint.  */
 702 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
 703                           CPUBreakpoint **breakpoint)
 704 {
 705     CPUBreakpoint *bp;
 706
 707     bp = g_malloc(sizeof(*bp));
 708
 709     bp->pc = pc;
 710     bp->flags = flags;
 711
 712     /* keep all GDB-injected breakpoints in front */
 713     if (flags & BP_GDB) {
 714         QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
 715     } else {
 716         QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
 717     }
 718
 719     breakpoint_invalidate(cpu, pc);
 720
 721     if (breakpoint) {
 722         *breakpoint = bp;
 723     }
 724     return 0;
 725 }
 726
 727 /* Remove a specific breakpoint.  */
 728 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
 729 {
 730     CPUBreakpoint *bp;
 731
 732     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
 733         if (bp->pc == pc && bp->flags == flags) {
 734             cpu_breakpoint_remove_by_ref(cpu, bp);
 735             return 0;
 736         }
 737     }
 738     return -ENOENT;
 739 }
 740
 741 /* Remove a specific breakpoint by reference.  */
 742 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
 743 {
 744     QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
 745
 746     breakpoint_invalidate(cpu, breakpoint->pc);
 747
 748     g_free(breakpoint);
 749 }
 750
 751 /* Remove all matching breakpoints. */
 752 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
 753 {
 754     CPUBreakpoint *bp, *next;
 755
 756     QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
 757         if (bp->flags & mask) {
 758             cpu_breakpoint_remove_by_ref(cpu, bp);
 759         }
 760     }
 761 }
 762
 763 /* enable or disable single step mode. EXCP_DEBUG is returned by the
 764    CPU loop after each instruction */
 765 void cpu_single_step(CPUState *cpu, int enabled)
 766 {
 767     if (cpu->singlestep_enabled != enabled) {
 768         cpu->singlestep_enabled = enabled;
 769         if (kvm_enabled()) {
 770             kvm_update_guest_debug(cpu, 0);
 771         } else {
 772             /* must flush all the translated code to avoid inconsistencies */
 773             /* XXX: only flush what is necessary */
 774             CPUArchState *env = cpu->env_ptr;
 775             tb_flush(env);
 776         }
 777     }
 778 }
 779
 780 void cpu_abort(CPUState *cpu, const char *fmt, ...)
 781 {
 782     va_list ap;
 783     va_list ap2;
 784
 785     va_start(ap, fmt);
 786     va_copy(ap2, ap);
 787     fprintf(stderr, "qemu: fatal: ");
 788     vfprintf(stderr, fmt, ap);
 789     fprintf(stderr, "\n");
 790     cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 791     if (qemu_log_enabled()) {
 792         qemu_log("qemu: fatal: ");
 793         qemu_log_vprintf(fmt, ap2);
 794         qemu_log("\n");
 795         log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 796         qemu_log_flush();
 797         qemu_log_close();
 798     }
 799     va_end(ap2);
 800     va_end(ap);
 801 #if defined(CONFIG_USER_ONLY)
 802     {
 803         struct sigaction act;
 804         sigfillset(&act.sa_mask);
 805         act.sa_handler = SIG_DFL;
 806         sigaction(SIGABRT, &act, NULL);
 807     }
 808 #endif
 809     abort();
 810 }
 811
 812 #if !defined(CONFIG_USER_ONLY)
 813 /* Called from RCU critical section */
 814 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
 815 {
 816     RAMBlock *block;
 817
 818     block = atomic_rcu_read(&ram_list.mru_block);
 819     if (block && addr - block->offset < block->max_length) {
 820         goto found;
 821     }
 822     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 823         if (addr - block->offset < block->max_length) {
 824             goto found;
 825         }
 826     }
 827
 828     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
 829     abort();
 830
 831 found:
 832     /* It is safe to write mru_block outside the iothread lock.  This
 833      * is what happens:
 834      *
 835      *     mru_block = xxx
 836      *     rcu_read_unlock()
 837      *                                        xxx removed from list
 838      *                  rcu_read_lock()
 839      *                  read mru_block
 840      *                                        mru_block = NULL;
 841      *                                        call_rcu(reclaim_ramblock, xxx);
 842      *                  rcu_read_unlock()
 843      *
 844      * atomic_rcu_set is not needed here.  The block was already published
 845      * when it was placed into the list.  Here we're just making an extra
 846      * copy of the pointer.
 847      */
 848     ram_list.mru_block = block;
 849     return block;
 850 }
 851
 852 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
 853 {
 854     ram_addr_t start1;
 855     RAMBlock *block;
 856     ram_addr_t end;
 857
 858     end = TARGET_PAGE_ALIGN(start + length);
 859     start &= TARGET_PAGE_MASK;
 860
 861     rcu_read_lock();
 862     block = qemu_get_ram_block(start);
 863     assert(block == qemu_get_ram_block(end - 1));
 864     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
 865     cpu_tlb_reset_dirty_all(start1, length);
 866     rcu_read_unlock();
 867 }
 868
 869 /* Note: start and end must be within the same ram block.  */
 870 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t length,
 871                                      unsigned client)
 872 {
 873     if (length == 0)
 874         return;
 875     cpu_physical_memory_clear_dirty_range_type(start, length, client);
 876
 877     if (tcg_enabled()) {
 878         tlb_reset_dirty_range_all(start, length);
 879     }
 880 }
 881
 882 static void cpu_physical_memory_set_dirty_tracking(bool enable)
 883 {
 884     in_migration = enable;
 885 }
 886
 887 /* Called from RCU critical section */
 888 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
 889                                        MemoryRegionSection *section,
 890                                        target_ulong vaddr,
 891                                        hwaddr paddr, hwaddr xlat,
 892                                        int prot,
 893                                        target_ulong *address)
 894 {
 895     hwaddr iotlb;
 896     CPUWatchpoint *wp;
 897
 898     if (memory_region_is_ram(section->mr)) {
 899         /* Normal RAM.  */
 900         iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
 901             + xlat;
 902         if (!section->readonly) {
 903             iotlb |= PHYS_SECTION_NOTDIRTY;
 904         } else {
 905             iotlb |= PHYS_SECTION_ROM;
 906         }
 907     } else {
 908         iotlb = section - section->address_space->dispatch->map.sections;
 909         iotlb += xlat;
 910     }
 911
 912     /* Make accesses to pages with watchpoints go via the
 913        watchpoint trap routines.  */
 914     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 915         if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
 916             /* Avoid trapping reads of pages with a write breakpoint. */
 917             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
 918                 iotlb = PHYS_SECTION_WATCH + paddr;
 919                 *address |= TLB_MMIO;
 920                 break;
 921             }
 922         }
 923     }
 924
 925     return iotlb;
 926 }
 927 #endif /* defined(CONFIG_USER_ONLY) */
 928
 929 #if !defined(CONFIG_USER_ONLY)
 930
 931 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
 932                              uint16_t section);
 933 static subpage_t *subpage_init(AddressSpace *as, hwaddr base);
 934
 935 static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
 936                                qemu_anon_ram_alloc;
 937
 938 /*
 939  * Set a custom physical guest memory alloator.
 940  * Accelerators with unusual needs may need this.  Hopefully, we can
 941  * get rid of it eventually.
 942  */
 943 void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
 944 {
 945     phys_mem_alloc = alloc;
 946 }
 947
 948 static uint16_t phys_section_add(PhysPageMap *map,
 949                                  MemoryRegionSection *section)
 950 {
 951     /* The physical section number is ORed with a page-aligned
 952      * pointer to produce the iotlb entries.  Thus it should
 953      * never overflow into the page-aligned value.
 954      */
 955     assert(map->sections_nb < TARGET_PAGE_SIZE);
 956
 957     if (map->sections_nb == map->sections_nb_alloc) {
 958         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
 959         map->sections = g_renew(MemoryRegionSection, map->sections,
 960                                 map->sections_nb_alloc);
 961     }
 962     map->sections[map->sections_nb] = *section;
 963     memory_region_ref(section->mr);
 964     return map->sections_nb++;
 965 }
 966
 967 static void phys_section_destroy(MemoryRegion *mr)
 968 {
 969     memory_region_unref(mr);
 970
 971     if (mr->subpage) {
 972         subpage_t *subpage = container_of(mr, subpage_t, iomem);
 973         object_unref(OBJECT(&subpage->iomem));
 974         g_free(subpage);
 975     }
 976 }
 977
 978 static void phys_sections_free(PhysPageMap *map)
 979 {
 980     while (map->sections_nb > 0) {
 981         MemoryRegionSection *section = &map->sections[--map->sections_nb];
 982         phys_section_destroy(section->mr);
 983     }
 984     g_free(map->sections);
 985     g_free(map->nodes);
 986 }
 987
/*
 * Register @section inside the subpage container of the target page that
 * holds its start address.
 *
 * If that page is currently mapped to io_mem_unassigned, a new subpage_t is
 * created with subpage_init() and installed for the page; otherwise the
 * subpage_t already mapped there is reused.  The section is then added to
 * the dispatch map and registered for its byte range within the page.
 */
static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
{
    subpage_t *subpage;
    /* Page-aligned address of the page containing the section start. */
    hwaddr base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
                                                   d->map.nodes, d->map.sections);
    /* One-page section describing the subpage container itself. */
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    hwaddr start, end;

    /* The page must either already be a subpage or still be unassigned. */
    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (!(existing->mr->subpage)) {
        /* First partial mapping in this page: create and install a subpage. */
        subpage = subpage_init(d->as, base);
        subsection.address_space = d->as;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    } else {
        subpage = container_of(existing->mr, subpage_t, iomem);
    }
    /* Offsets of the section within the page; end is inclusive. */
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}
1017
1018
1019 static void register_multipage(AddressSpaceDispatch *d,
1020                                MemoryRegionSection *section)
1021 {
1022     hwaddr start_addr = section->offset_within_address_space;
1023     uint16_t section_index = phys_section_add(&d->map, section);
1024     uint64_t num_pages = int128_get64(int128_rshift(section->size,
1025                                                     TARGET_PAGE_BITS));
1026
1027     assert(num_pages);
1028     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1029 }
1030
/*
 * MemoryListener callback: add @section to the next dispatch table of the
 * AddressSpace that owns @listener.
 *
 * The section is consumed piece by piece.  'now' is the piece being
 * registered and 'remain' is what is left (including 'now' until the loop
 * advances past it).  Pieces that do not start on a page boundary or are
 * shorter than a page go through register_subpage(); page-aligned runs of
 * whole pages go through register_multipage().
 */
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = as->next_dispatch;
    MemoryRegionSection now = *section, remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        /* Unaligned start: register the head up to the next page boundary
         * (or the whole section if it ends before that). */
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(d, &now);
    } else {
        /* Aligned start: nothing consumed yet. */
        now.size = int128_zero();
    }
    /* Loop until the piece just registered is all that remained. */
    while (int128_ne(remain.size, now.size)) {
        /* Advance 'remain' past the piece registered last. */
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
        now = remain;
        if (int128_lt(remain.size, page_size)) {
            /* Less than a page left: register it all as a subpage. */
            register_subpage(d, &now);
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
            /* Unaligned start with at least a page left. */
            now.size = page_size;
            register_subpage(d, &now);
        } else {
            /* Aligned: round the size down to a multiple of the page size. */
            now.size = int128_and(now.size, int128_neg(page_size));
            register_multipage(d, &now);
        }
    }
}
1063
/* Flush the KVM coalesced MMIO buffer; does nothing when KVM is not enabled. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }
    kvm_flush_coalesced_mmio_buffer();
}
1069
/* Acquire ram_list.mutex, the lock that protects writes to ram_list. */
void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}
1074
/* Release ram_list.mutex taken by qemu_mutex_lock_ramlist(). */
void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}
1079
1080 #ifdef __linux__
1081
1082 #include <sys/vfs.h>
1083
1084 #define HUGETLBFS_MAGIC       0x958458f6
1085
1086 static long gethugepagesize(const char *path, Error **errp)
1087 {
1088     struct statfs fs;
1089     int ret;
1090
1091     do {
1092         ret = statfs(path, &fs);
1093     } while (ret != 0 && errno == EINTR);
1094
1095     if (ret != 0) {
1096         error_setg_errno(errp, errno, "failed to get page size of file %s",
1097                          path);
1098         return 0;
1099     }
1100
1101     if (fs.f_type != HUGETLBFS_MAGIC)
1102         fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
1103
1104     return fs.f_bsize;
1105 }
1106
1107 static void *file_ram_alloc(RAMBlock *block,
1108                             ram_addr_t memory,
1109                             const char *path,
1110                             Error **errp)
1111 {
1112     char *filename;
1113     char *sanitized_name;
1114     char *c;
1115     void *area = NULL;
1116     int fd;
1117     uint64_t hpagesize;
1118     Error *local_err = NULL;
1119
1120     hpagesize = gethugepagesize(path, &local_err);
1121     if (local_err) {
1122         error_propagate(errp, local_err);
1123         goto error;
1124     }
1125     block->mr->align = hpagesize;
1126
1127     if (memory < hpagesize) {
1128         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1129                    "or larger than huge page size 0x%" PRIx64,
1130                    memory, hpagesize);
1131         goto error;
1132     }
1133
1134     if (kvm_enabled() && !kvm_has_sync_mmu()) {
1135         error_setg(errp,
1136                    "host lacks kvm mmu notifiers, -mem-path unsupported");
1137         goto error;
1138     }
1139
1140     /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1141     sanitized_name = g_strdup(memory_region_name(block->mr));
1142     for (c = sanitized_name; *c != '\0'; c++) {
1143         if (*c == '/')
1144             *c = '_';
1145     }
1146
1147     filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1148                                sanitized_name);
1149     g_free(sanitized_name);
1150
1151     fd = mkstemp(filename);
1152     if (fd < 0) {
1153         error_setg_errno(errp, errno,
1154                          "unable to create backing store for hugepages");
1155         g_free(filename);
1156         goto error;
1157     }
1158     unlink(filename);
1159     g_free(filename);
1160
1161     memory = (memory+hpagesize-1) & ~(hpagesize-1);
1162
1163     /*
1164      * ftruncate is not supported by hugetlbfs in older
1165      * hosts, so don't bother bailing out on errors.
1166      * If anything goes wrong with it under other filesystems,
1167      * mmap will fail.
1168      */
1169     if (ftruncate(fd, memory)) {
1170         perror("ftruncate");
1171     }
1172
1173     area = mmap(0, memory, PROT_READ | PROT_WRITE,
1174                 (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE),
1175                 fd, 0);
1176     if (area == MAP_FAILED) {
1177         error_setg_errno(errp, errno,
1178                          "unable to map backing store for hugepages");
1179         close(fd);
1180         goto error;
1181     }
1182
1183     if (mem_prealloc) {
1184         os_mem_prealloc(fd, area, memory);
1185     }
1186
1187     block->fd = fd;
1188     return area;
1189
1190 error:
1191     if (mem_prealloc) {
1192         error_report("%s", error_get_pretty(*errp));
1193         exit(1);
1194     }
1195     return NULL;
1196 }
1197 #endif
1198
1199 /* Called with the ramlist lock held.  */
1200 static ram_addr_t find_ram_offset(ram_addr_t size)
1201 {
1202     RAMBlock *block, *next_block;
1203     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1204
1205     assert(size != 0); /* it would hand out same offset multiple times */
1206
1207     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1208         return 0;
1209     }
1210
1211     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1212         ram_addr_t end, next = RAM_ADDR_MAX;
1213
1214         end = block->offset + block->max_length;
1215
1216         QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
1217             if (next_block->offset >= end) {
1218                 next = MIN(next, next_block->offset);
1219             }
1220         }
1221         if (next - end >= size && next - end < mingap) {
1222             offset = end;
1223             mingap = next - end;
1224         }
1225     }
1226
1227     if (offset == RAM_ADDR_MAX) {
1228         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1229                 (uint64_t)size);
1230         abort();
1231     }
1232
1233     return offset;
1234 }
1235
1236 ram_addr_t last_ram_offset(void)
1237 {
1238     RAMBlock *block;
1239     ram_addr_t last = 0;
1240
1241     rcu_read_lock();
1242     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1243         last = MAX(last, block->offset + block->max_length);
1244     }
1245     rcu_read_unlock();
1246     return last;
1247 }
1248
1249 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1250 {
1251     int ret;
1252
1253     /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1254     if (!qemu_opt_get_bool(qemu_get_machine_opts(),
1255                            "dump-guest-core", true)) {
1256         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1257         if (ret) {
1258             perror("qemu_madvise");
1259             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1260                             "but dump_guest_core=off specified\n");
1261         }
1262     }
1263 }
1264
1265 /* Called within an RCU critical section, or while the ramlist lock
1266  * is held.
1267  */
1268 static RAMBlock *find_ram_block(ram_addr_t addr)
1269 {
1270     RAMBlock *block;
1271
1272     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1273         if (block->offset == addr) {
1274             return block;
1275         }
1276     }
1277
1278     return NULL;
1279 }
1280
1281 /* Called with iothread lock held.  */
1282 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
1283 {
1284     RAMBlock *new_block, *block;
1285
1286     rcu_read_lock();
1287     new_block = find_ram_block(addr);
1288     assert(new_block);
1289     assert(!new_block->idstr[0]);
1290
1291     if (dev) {
1292         char *id = qdev_get_dev_path(dev);
1293         if (id) {
1294             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1295             g_free(id);
1296         }
1297     }
1298     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1299
1300     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1301         if (block != new_block && !strcmp(block->idstr, new_block->idstr)) {
1302             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1303                     new_block->idstr);
1304             abort();
1305         }
1306     }
1307     rcu_read_unlock();
1308 }
1309
1310 /* Called with iothread lock held.  */
1311 void qemu_ram_unset_idstr(ram_addr_t addr)
1312 {
1313     RAMBlock *block;
1314
1315     /* FIXME: arch_init.c assumes that this is not called throughout
1316      * migration.  Ignore the problem since hot-unplug during migration
1317      * does not work anyway.
1318      */
1319
1320     rcu_read_lock();
1321     block = find_ram_block(addr);
1322     if (block) {
1323         memset(block->idstr, 0, sizeof(block->idstr));
1324     }
1325     rcu_read_unlock();
1326 }
1327
1328 static int memory_try_enable_merging(void *addr, size_t len)
1329 {
1330     if (!qemu_opt_get_bool(qemu_get_machine_opts(), "mem-merge", true)) {
1331         /* disabled by the user */
1332         return 0;
1333     }
1334
1335     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1336 }
1337
1338 /* Only legal before guest might have detected the memory size: e.g. on
1339  * incoming migration, or right after reset.
1340  *
1341  * As memory core doesn't know how is memory accessed, it is up to
1342  * resize callback to update device state and/or add assertions to detect
1343  * misuse, if necessary.
1344  */
1345 int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
1346 {
1347     RAMBlock *block = find_ram_block(base);
1348
1349     assert(block);
1350
1351     newsize = TARGET_PAGE_ALIGN(newsize);
1352
1353     if (block->used_length == newsize) {
1354         return 0;
1355     }
1356
1357     if (!(block->flags & RAM_RESIZEABLE)) {
1358         error_setg_errno(errp, EINVAL,
1359                          "Length mismatch: %s: 0x" RAM_ADDR_FMT
1360                          " in != 0x" RAM_ADDR_FMT, block->idstr,
1361                          newsize, block->used_length);
1362         return -EINVAL;
1363     }
1364
1365     if (block->max_length < newsize) {
1366         error_setg_errno(errp, EINVAL,
1367                          "Length too large: %s: 0x" RAM_ADDR_FMT
1368                          " > 0x" RAM_ADDR_FMT, block->idstr,
1369                          newsize, block->max_length);
1370         return -EINVAL;
1371     }
1372
1373     cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1374     block->used_length = newsize;
1375     cpu_physical_memory_set_dirty_range(block->offset, block->used_length);
1376     memory_region_set_size(block->mr, newsize);
1377     if (block->resized) {
1378         block->resized(block->idstr, newsize, block->host);
1379     }
1380     return 0;
1381 }
1382
/*
 * Assign an offset to @new_block, allocate host memory for it if it has
 * none, and insert it into ram_list.
 *
 * Returns the block's offset, or -1 with @errp set when the host memory
 * allocation fails.  The ramlist lock is taken and released internally.
 */
static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;

    /* Size of the ram_addr_t space in pages before the block is added. */
    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr);
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return -1;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    /* block is non-NULL only if the loop stopped at a smaller block. */
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    /* Grow the dirty bitmaps if the new block extended the address space. */
    if (new_ram_size > old_ram_size) {
        int i;

        /* ram_list.dirty_memory[] is protected by the iothread lock.  */
        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
            ram_list.dirty_memory[i] =
                bitmap_zero_extend(ram_list.dirty_memory[i],
                                   old_ram_size, new_ram_size);
       }
    }
    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length);

    /* Apply dump, hugepage, fork and KVM settings to the host mapping. */
    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        if (kvm_enabled()) {
            kvm_setup_guest_memory(new_block->host, new_block->max_length);
        }
    }

    return new_block->offset;
}
1462
1463 #ifdef __linux__
1464 ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
1465                                     bool share, const char *mem_path,
1466                                     Error **errp)
1467 {
1468     RAMBlock *new_block;
1469     ram_addr_t addr;
1470     Error *local_err = NULL;
1471
1472     if (xen_enabled()) {
1473         error_setg(errp, "-mem-path not supported with Xen");
1474         return -1;
1475     }
1476
1477     if (phys_mem_alloc != qemu_anon_ram_alloc) {
1478         /*
1479          * file_ram_alloc() needs to allocate just like
1480          * phys_mem_alloc, but we haven't bothered to provide
1481          * a hook there.
1482          */
1483         error_setg(errp,
1484                    "-mem-path not supported with this accelerator");
1485         return -1;
1486     }
1487
1488     size = TARGET_PAGE_ALIGN(size);
1489     new_block = g_malloc0(sizeof(*new_block));
1490     new_block->mr = mr;
1491     new_block->used_length = size;
1492     new_block->max_length = size;
1493     new_block->flags = share ? RAM_SHARED : 0;
1494     new_block->host = file_ram_alloc(new_block, size,
1495                                      mem_path, errp);
1496     if (!new_block->host) {
1497         g_free(new_block);
1498         return -1;
1499     }
1500
1501     addr = ram_block_add(new_block, &local_err);
1502     if (local_err) {
1503         g_free(new_block);
1504         error_propagate(errp, local_err);
1505         return -1;
1506     }
1507     return addr;
1508 }
1509 #endif
1510
1511 static
1512 ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
1513                                    void (*resized)(const char*,
1514                                                    uint64_t length,
1515                                                    void *host),
1516                                    void *host, bool resizeable,
1517                                    MemoryRegion *mr, Error **errp)
1518 {
1519     RAMBlock *new_block;
1520     ram_addr_t addr;
1521     Error *local_err = NULL;
1522
1523     size = TARGET_PAGE_ALIGN(size);
1524     max_size = TARGET_PAGE_ALIGN(max_size);
1525     new_block = g_malloc0(sizeof(*new_block));
1526     new_block->mr = mr;
1527     new_block->resized = resized;
1528     new_block->used_length = size;
1529     new_block->max_length = max_size;
1530     assert(max_size >= size);
1531     new_block->fd = -1;
1532     new_block->host = host;
1533     if (host) {
1534         new_block->flags |= RAM_PREALLOC;
1535     }
1536     if (resizeable) {
1537         new_block->flags |= RAM_RESIZEABLE;
1538     }
1539     addr = ram_block_add(new_block, &local_err);
1540     if (local_err) {
1541         g_free(new_block);
1542         error_propagate(errp, local_err);
1543         return -1;
1544     }
1545     return addr;
1546 }
1547
/*
 * Create a fixed-size RAM block of @size bytes for @mr on top of the
 * caller-provided memory at @host.
 * Returns the block's offset, or -1 with @errp set on failure.
 */
ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
}
1553
/*
 * Create a fixed-size RAM block of @size bytes for @mr; no host pointer is
 * supplied, so the memory is allocated when the block is added.
 * Returns the block's offset, or -1 with @errp set on failure.
 */
ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
}
1558
/*
 * Create a resizeable RAM block for @mr with initial used length @size and
 * maximum length @maxsz.  @resized is stored as the block's resize
 * callback.
 * Returns the block's offset, or -1 with @errp set on failure.
 */
ram_addr_t qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
}
1567
1568 void qemu_ram_free_from_ptr(ram_addr_t addr)
1569 {
1570     RAMBlock *block;
1571
1572     qemu_mutex_lock_ramlist();
1573     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1574         if (addr == block->offset) {
1575             QLIST_REMOVE_RCU(block, next);
1576             ram_list.mru_block = NULL;
1577             /* Write list before version */
1578             smp_wmb();
1579             ram_list.version++;
1580             g_free_rcu(block, rcu);
1581             break;
1582         }
1583     }
1584     qemu_mutex_unlock_ramlist();
1585 }
1586
1587 static void reclaim_ramblock(RAMBlock *block)
1588 {
1589     if (block->flags & RAM_PREALLOC) {
1590         ;
1591     } else if (xen_enabled()) {
1592         xen_invalidate_map_cache_entry(block->host);
1593 #ifndef _WIN32
1594     } else if (block->fd >= 0) {
1595         munmap(block->host, block->max_length);
1596         close(block->fd);
1597 #endif
1598     } else {
1599         qemu_anon_ram_free(block->host, block->max_length);
1600     }
1601     g_free(block);
1602 }
1603
1604 void qemu_ram_free(ram_addr_t addr)
1605 {
1606     RAMBlock *block;
1607
1608     qemu_mutex_lock_ramlist();
1609     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1610         if (addr == block->offset) {
1611             QLIST_REMOVE_RCU(block, next);
1612             ram_list.mru_block = NULL;
1613             /* Write list before version */
1614             smp_wmb();
1615             ram_list.version++;
1616             call_rcu(block, reclaim_ramblock, rcu);
1617             break;
1618         }
1619     }
1620     qemu_mutex_unlock_ramlist();
1621 }
1622
1623 #ifndef _WIN32
/*
 * Replace the host mapping of @length bytes at RAM offset @addr with a
 * fresh mapping at the same host virtual address.
 *
 * RAM_PREALLOC blocks are left alone; Xen aborts.  File-backed blocks are
 * remapped from block->fd at the matching file offset, other blocks get a
 * new anonymous private mapping.  Exits the process if the mapping does
 * not land on the original address.
 */
void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
{
    RAMBlock *block;
    ram_addr_t offset;
    int flags;
    void *area, *vaddr;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        /* Unsigned subtraction: blocks above addr yield a huge offset. */
        offset = addr - block->offset;
        if (offset < block->max_length) {
            vaddr = ramblock_ptr(block, offset);
            if (block->flags & RAM_PREALLOC) {
                ;
            } else if (xen_enabled()) {
                abort();
            } else {
                /* MAP_FIXED forces the new mapping onto the old address. */
                flags = MAP_FIXED;
                munmap(vaddr, length);
                if (block->fd >= 0) {
                    flags |= (block->flags & RAM_SHARED ?
                              MAP_SHARED : MAP_PRIVATE);
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, block->fd, offset);
                } else {
                    /*
                     * Remap needs to match alloc.  Accelerators that
                     * set phys_mem_alloc never remap.  If they did,
                     * we'd need a remap hook here.
                     */
                    assert(phys_mem_alloc == qemu_anon_ram_alloc);

                    flags |= MAP_PRIVATE | MAP_ANONYMOUS;
                    area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
                                flags, -1, 0);
                }
                if (area != vaddr) {
                    fprintf(stderr, "Could not remap addr: "
                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
                            length, addr);
                    exit(1);
                }
                /* Re-apply the per-mapping advice lost with the old mapping. */
                memory_try_enable_merging(vaddr, length);
                qemu_ram_setup_dump(vaddr, length);
            }
        }
    }
}
1671 #endif /* !_WIN32 */
1672
/*
 * Return the fd field of the RAM block that qemu_get_ram_block() finds for
 * @addr.  The lookup runs inside an RCU read-side critical section.
 */
int qemu_get_ram_fd(ram_addr_t addr)
{
    RAMBlock *block;
    int fd;

    rcu_read_lock();
    block = qemu_get_ram_block(addr);
    fd = block->fd;
    rcu_read_unlock();
    return fd;
}
1684
/*
 * Return the host pointer for offset 0 of the RAM block that
 * qemu_get_ram_block() finds for @addr.  The lookup runs inside an RCU
 * read-side critical section.
 */
void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
{
    RAMBlock *block;
    void *ptr;

    rcu_read_lock();
    block = qemu_get_ram_block(addr);
    ptr = ramblock_ptr(block, 0);
    rcu_read_unlock();
    return ptr;
}
1696
1697 /* Return a host pointer to ram allocated with qemu_ram_alloc.
1698  * This should not be used for general purpose DMA.  Use address_space_map
1699  * or address_space_rw instead. For local memory (e.g. video ram) that the
1700  * device owns, use memory_region_get_ram_ptr.
1701  *
1702  * By the time this function returns, the returned pointer is not protected
1703  * by RCU anymore.  If the caller is not within an RCU critical section and
1704  * does not hold the iothread lock, it must have other means of protecting the
1705  * pointer, such as a reference to the region that includes the incoming
1706  * ram_addr_t.
1707  */
1708 void *qemu_get_ram_ptr(ram_addr_t addr)
1709 {
1710     RAMBlock *block;
1711     void *ptr;
1712
1713     rcu_read_lock();
1714     block = qemu_get_ram_block(addr);
1715
1716     if (xen_enabled() && block->host == NULL) {
1717         /* We need to check if the requested address is in the RAM
1718          * because we don't want to map the entire memory in QEMU.
1719          * In that case just map until the end of the page.
1720          */
1721         if (block->offset == 0) {
1722             ptr = xen_map_cache(addr, 0, 0);
1723             goto unlock;
1724         }
1725
1726         block->host = xen_map_cache(block->offset, block->max_length, 1);
1727     }
1728     ptr = ramblock_ptr(block, addr - block->offset);
1729
1730 unlock:
1731     rcu_read_unlock();
1732     return ptr;
1733 }
1734
1735 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
1736  * but takes a size argument.
1737  *
1738  * By the time this function returns, the returned pointer is not protected
1739  * by RCU anymore.  If the caller is not within an RCU critical section and
1740  * does not hold the iothread lock, it must have other means of protecting the
1741  * pointer, such as a reference to the region that includes the incoming
1742  * ram_addr_t.
1743  */
1744 static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
1745 {
1746     void *ptr;
1747     if (*size == 0) {
1748         return NULL;
1749     }
1750     if (xen_enabled()) {
1751         return xen_map_cache(addr, *size, 1);
1752     } else {
1753         RAMBlock *block;
1754         rcu_read_lock();
1755         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1756             if (addr - block->offset < block->max_length) {
1757                 if (addr - block->offset + *size > block->max_length)
1758                     *size = block->max_length - addr + block->offset;
1759                 ptr = ramblock_ptr(block, addr - block->offset);
1760                 rcu_read_unlock();
1761                 return ptr;
1762             }
1763         }
1764
1765         fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1766         abort();
1767     }
1768 }
1769
1770 /* Some of the softmmu routines need to translate from a host pointer
1771  * (typically a TLB entry) back to a ram offset.
1772  *
1773  * By the time this function returns, the returned pointer is not protected
1774  * by RCU anymore.  If the caller is not within an RCU critical section and
1775  * does not hold the iothread lock, it must have other means of protecting the
1776  * pointer, such as a reference to the region that includes the incoming
1777  * ram_addr_t.
1778  */
1779 MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
1780 {
1781     RAMBlock *block;
1782     uint8_t *host = ptr;
1783     MemoryRegion *mr;
1784
1785     if (xen_enabled()) {
1786         rcu_read_lock();
1787         *ram_addr = xen_ram_addr_from_mapcache(ptr);
1788         mr = qemu_get_ram_block(*ram_addr)->mr;
1789         rcu_read_unlock();
1790         return mr;
1791     }
1792
1793     rcu_read_lock();
1794     block = atomic_rcu_read(&ram_list.mru_block);
1795     if (block && block->host && host - block->host < block->max_length) {
1796         goto found;
1797     }
1798
1799     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1800         /* This case append when the block is not mapped. */
1801         if (block->host == NULL) {
1802             continue;
1803         }
1804         if (host - block->host < block->max_length) {
1805             goto found;
1806         }
1807     }
1808
1809     rcu_read_unlock();
1810     return NULL;
1811
1812 found:
1813     *ram_addr = block->offset + (host - block->host);
1814     mr = block->mr;
1815     rcu_read_unlock();
1816     return mr;
1817 }
1818
1819 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
1820                                uint64_t val, unsigned size)
1821 {
1822     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
1823         tb_invalidate_phys_page_fast(ram_addr, size);
1824     }
1825     switch (size) {
1826     case 1:
1827         stb_p(qemu_get_ram_ptr(ram_addr), val);
1828         break;
1829     case 2:
1830         stw_p(qemu_get_ram_ptr(ram_addr), val);
1831         break;
1832     case 4:
1833         stl_p(qemu_get_ram_ptr(ram_addr), val);
1834         break;
1835     default:
1836         abort();
1837     }
1838     cpu_physical_memory_set_dirty_range_nocode(ram_addr, size);
1839     /* we remove the notdirty callback only if the code has been
1840        flushed */
1841     if (!cpu_physical_memory_is_clean(ram_addr)) {
1842         CPUArchState *env = current_cpu->env_ptr;
1843         tlb_set_dirty(env, current_cpu->mem_io_vaddr);
1844     }
1845 }
1846
/*
 * Access filter for the notdirty region: accepts writes and rejects reads,
 * regardless of address and size.
 */
static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
                                 unsigned size, bool is_write)
{
    return is_write;
}
1852
/*
 * Callbacks for the notdirty region: a write handler and an access filter
 * only; no read handler is provided.
 */
static const MemoryRegionOps notdirty_mem_ops = {
    .write = notdirty_mem_write,
    .valid.accepts = notdirty_mem_accepts,
    .endianness = DEVICE_NATIVE_ENDIAN,
};
1858
/*
 * Generate a debug exception if a watchpoint has been hit.
 *
 * @offset: offset of the access within the page of cpu->mem_io_vaddr
 * @len:    length of the access in bytes
 * @flags:  BP_MEM_READ or BP_MEM_WRITE, matched against each watchpoint
 *
 * Operates on current_cpu.
 */
static void check_watchpoint(int offset, int len, int flags)
{
    CPUState *cpu = current_cpu;
    CPUArchState *env = cpu->env_ptr;
    target_ulong pc, cs_base;
    target_ulong vaddr;
    CPUWatchpoint *wp;
    int cpu_flags;

    if (cpu->watchpoint_hit) {
        /* We re-entered the check after replacing the TB. Now raise
         * the debug interrupt so that is will trigger after the
         * current instruction. */
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
        return;
    }
    /* Rebuild the full virtual address of the access. */
    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (cpu_watchpoint_address_matches(wp, vaddr, len)
            && (wp->flags & flags)) {
            /* Record the kind of access that hit and where. */
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = vaddr;
            /* Only the first matching watchpoint is acted upon. */
            if (!cpu->watchpoint_hit) {
                cpu->watchpoint_hit = wp;
                tb_check_watchpoint(cpu);
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                    cpu->exception_index = EXCP_DEBUG;
                    cpu_loop_exit(cpu);
                } else {
                    cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
                    tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
                    cpu_resume_from_signal(cpu, NULL);
                }
            }
        } else {
            /* Not hit by this access: clear any stale hit flags. */
            wp->flags &= ~BP_WATCHPOINT_HIT;
        }
    }
}
1903
1904 /* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
1905    so these check for a hit then pass through to the normal out-of-line
1906    phys routines.  */
1907 static uint64_t watch_mem_read(void *opaque, hwaddr addr,
1908                                unsigned size)
1909 {
1910     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, BP_MEM_READ);
1911     switch (size) {
1912     case 1: return ldub_phys(&address_space_memory, addr);
1913     case 2: return lduw_phys(&address_space_memory, addr);
1914     case 4: return ldl_phys(&address_space_memory, addr);
1915     default: abort();
1916     }
1917 }
1918
1919 static void watch_mem_write(void *opaque, hwaddr addr,
1920                             uint64_t val, unsigned size)
1921 {
1922     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, BP_MEM_WRITE);
1923     switch (size) {
1924     case 1:
1925         stb_phys(&address_space_memory, addr, val);
1926         break;
1927     case 2:
1928         stw_phys(&address_space_memory, addr, val);
1929         break;
1930     case 4:
1931         stl_phys(&address_space_memory, addr, val);
1932         break;
1933     default: abort();
1934     }
1935 }
1936
1937 static const MemoryRegionOps watch_mem_ops = {
1938     .read = watch_mem_read,
1939     .write = watch_mem_write,
1940     .endianness = DEVICE_NATIVE_ENDIAN,
1941 };
1942
1943 static uint64_t subpage_read(void *opaque, hwaddr addr,
1944                              unsigned len)
1945 {
1946     subpage_t *subpage = opaque;
1947     uint8_t buf[8];
1948
1949 #if defined(DEBUG_SUBPAGE)
1950     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
1951            subpage, len, addr);
1952 #endif
1953     address_space_read(subpage->as, addr + subpage->base, buf, len);
1954     switch (len) {
1955     case 1:
1956         return ldub_p(buf);
1957     case 2:
1958         return lduw_p(buf);
1959     case 4:
1960         return ldl_p(buf);
1961     case 8:
1962         return ldq_p(buf);
1963     default:
1964         abort();
1965     }
1966 }
1967
1968 static void subpage_write(void *opaque, hwaddr addr,
1969                           uint64_t value, unsigned len)
1970 {
1971     subpage_t *subpage = opaque;
1972     uint8_t buf[8];
1973
1974 #if defined(DEBUG_SUBPAGE)
1975     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
1976            " value %"PRIx64"\n",
1977            __func__, subpage, len, addr, value);
1978 #endif
1979     switch (len) {
1980     case 1:
1981         stb_p(buf, value);
1982         break;
1983     case 2:
1984         stw_p(buf, value);
1985         break;
1986     case 4:
1987         stl_p(buf, value);
1988         break;
1989     case 8:
1990         stq_p(buf, value);
1991         break;
1992     default:
1993         abort();
1994     }
1995     address_space_write(subpage->as, addr + subpage->base, buf, len);
1996 }
1997
1998 static bool subpage_accepts(void *opaque, hwaddr addr,
1999                             unsigned len, bool is_write)
2000 {
2001     subpage_t *subpage = opaque;
2002 #if defined(DEBUG_SUBPAGE)
2003     printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2004            __func__, subpage, is_write ? 'w' : 'r', len, addr);
2005 #endif
2006
2007     return address_space_access_valid(subpage->as, addr + subpage->base,
2008                                       len, is_write);
2009 }
2010
2011 static const MemoryRegionOps subpage_ops = {
2012     .read = subpage_read,
2013     .write = subpage_write,
2014     .impl.min_access_size = 1,
2015     .impl.max_access_size = 8,
2016     .valid.min_access_size = 1,
2017     .valid.max_access_size = 8,
2018     .valid.accepts = subpage_accepts,
2019     .endianness = DEVICE_NATIVE_ENDIAN,
2020 };
2021
2022 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2023                              uint16_t section)
2024 {
2025     int idx, eidx;
2026
2027     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2028         return -1;
2029     idx = SUBPAGE_IDX(start);
2030     eidx = SUBPAGE_IDX(end);
2031 #if defined(DEBUG_SUBPAGE)
2032     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2033            __func__, mmio, start, end, idx, eidx, section);
2034 #endif
2035     for (; idx <= eidx; idx++) {
2036         mmio->sub_section[idx] = section;
2037     }
2038
2039     return 0;
2040 }
2041
2042 static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2043 {
2044     subpage_t *mmio;
2045
2046     mmio = g_malloc0(sizeof(subpage_t));
2047
2048     mmio->as = as;
2049     mmio->base = base;
2050     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2051                           NULL, TARGET_PAGE_SIZE);
2052     mmio->iomem.subpage = true;
2053 #if defined(DEBUG_SUBPAGE)
2054     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2055            mmio, base, TARGET_PAGE_SIZE);
2056 #endif
2057     subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2058
2059     return mmio;
2060 }
2061
2062 static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
2063                               MemoryRegion *mr)
2064 {
2065     assert(as);
2066     MemoryRegionSection section = {
2067         .address_space = as,
2068         .mr = mr,
2069         .offset_within_address_space = 0,
2070         .offset_within_region = 0,
2071         .size = int128_2_64(),
2072     };
2073
2074     return phys_section_add(map, &section);
2075 }
2076
2077 MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index)
2078 {
2079     AddressSpaceDispatch *d = atomic_rcu_read(&cpu->memory_dispatch);
2080     MemoryRegionSection *sections = d->map.sections;
2081
2082     return sections[index & ~TARGET_PAGE_MASK].mr;
2083 }
2084
2085 static void io_mem_init(void)
2086 {
2087     memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2088     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2089                           NULL, UINT64_MAX);
2090     memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2091                           NULL, UINT64_MAX);
2092     memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2093                           NULL, UINT64_MAX);
2094 }
2095
2096 static void mem_begin(MemoryListener *listener)
2097 {
2098     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2099     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2100     uint16_t n;
2101
2102     n = dummy_section(&d->map, as, &io_mem_unassigned);
2103     assert(n == PHYS_SECTION_UNASSIGNED);
2104     n = dummy_section(&d->map, as, &io_mem_notdirty);
2105     assert(n == PHYS_SECTION_NOTDIRTY);
2106     n = dummy_section(&d->map, as, &io_mem_rom);
2107     assert(n == PHYS_SECTION_ROM);
2108     n = dummy_section(&d->map, as, &io_mem_watch);
2109     assert(n == PHYS_SECTION_WATCH);
2110
2111     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2112     d->as = as;
2113     as->next_dispatch = d;
2114 }
2115
2116 static void address_space_dispatch_free(AddressSpaceDispatch *d)
2117 {
2118     phys_sections_free(&d->map);
2119     g_free(d);
2120 }
2121
2122 static void mem_commit(MemoryListener *listener)
2123 {
2124     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2125     AddressSpaceDispatch *cur = as->dispatch;
2126     AddressSpaceDispatch *next = as->next_dispatch;
2127
2128     phys_page_compact_all(next, next->map.nodes_nb);
2129
2130     atomic_rcu_set(&as->dispatch, next);
2131     if (cur) {
2132         call_rcu(cur, address_space_dispatch_free, rcu);
2133     }
2134 }
2135
2136 static void tcg_commit(MemoryListener *listener)
2137 {
2138     CPUState *cpu;
2139
2140     /* since each CPU stores ram addresses in its TLB cache, we must
2141        reset the modified entries */
2142     /* XXX: slow ! */
2143     CPU_FOREACH(cpu) {
2144         /* FIXME: Disentangle the cpu.h circular files deps so we can
2145            directly get the right CPU from listener.  */
2146         if (cpu->tcg_as_listener != listener) {
2147             continue;
2148         }
2149         cpu_reload_memory_map(cpu);
2150     }
2151 }
2152
2153 static void core_log_global_start(MemoryListener *listener)
2154 {
2155     cpu_physical_memory_set_dirty_tracking(true);
2156 }
2157
2158 static void core_log_global_stop(MemoryListener *listener)
2159 {
2160     cpu_physical_memory_set_dirty_tracking(false);
2161 }
2162
2163 static MemoryListener core_memory_listener = {
2164     .log_global_start = core_log_global_start,
2165     .log_global_stop = core_log_global_stop,
2166     .priority = 1,
2167 };
2168
2169 void address_space_init_dispatch(AddressSpace *as)
2170 {
2171     as->dispatch = NULL;
2172     as->dispatch_listener = (MemoryListener) {
2173         .begin = mem_begin,
2174         .commit = mem_commit,
2175         .region_add = mem_add,
2176         .region_nop = mem_add,
2177         .priority = 0,
2178     };
2179     memory_listener_register(&as->dispatch_listener, as);
2180 }
2181
2182 void address_space_unregister(AddressSpace *as)
2183 {
2184     memory_listener_unregister(&as->dispatch_listener);
2185 }
2186
2187 void address_space_destroy_dispatch(AddressSpace *as)
2188 {
2189     AddressSpaceDispatch *d = as->dispatch;
2190
2191     atomic_rcu_set(&as->dispatch, NULL);
2192     if (d) {
2193         call_rcu(d, address_space_dispatch_free, rcu);
2194     }
2195 }
2196
2197 static void memory_map_init(void)
2198 {
2199     system_memory = g_malloc(sizeof(*system_memory));
2200
2201     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2202     address_space_init(&address_space_memory, system_memory, "memory");
2203
2204     system_io = g_malloc(sizeof(*system_io));
2205     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2206                           65536);
2207     address_space_init(&address_space_io, system_io, "I/O");
2208
2209     memory_listener_register(&core_memory_listener, &address_space_memory);
2210 }
2211
2212 MemoryRegion *get_system_memory(void)
2213 {
2214     return system_memory;
2215 }
2216
2217 MemoryRegion *get_system_io(void)
2218 {
2219     return system_io;
2220 }
2221
2222 #endif /* !defined(CONFIG_USER_ONLY) */
2223
2224 /* physical memory access (slow version, mainly for debug) */
2225 #if defined(CONFIG_USER_ONLY)
2226 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2227                         uint8_t *buf, int len, int is_write)
2228 {
2229     int l, flags;
2230     target_ulong page;
2231     void * p;
2232
2233     while (len > 0) {
2234         page = addr & TARGET_PAGE_MASK;
2235         l = (page + TARGET_PAGE_SIZE) - addr;
2236         if (l > len)
2237             l = len;
2238         flags = page_get_flags(page);
2239         if (!(flags & PAGE_VALID))
2240             return -1;
2241         if (is_write) {
2242             if (!(flags & PAGE_WRITE))
2243                 return -1;
2244             /* XXX: this code should not depend on lock_user */
2245             if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2246                 return -1;
2247             memcpy(p, buf, l);
2248             unlock_user(p, addr, l);
2249         } else {
2250             if (!(flags & PAGE_READ))
2251                 return -1;
2252             /* XXX: this code should not depend on lock_user */
2253             if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2254                 return -1;
2255             memcpy(buf, p, l);
2256             unlock_user(p, addr, 0);
2257         }
2258         len -= l;
2259         buf += l;
2260         addr += l;
2261     }
2262     return 0;
2263 }
2264
2265 #else
2266
2267 static void invalidate_and_set_dirty(hwaddr addr,
2268                                      hwaddr length)
2269 {
2270     if (cpu_physical_memory_range_includes_clean(addr, length)) {
2271         tb_invalidate_phys_range(addr, addr + length, 0);
2272         cpu_physical_memory_set_dirty_range_nocode(addr, length);
2273     }
2274     xen_modified_memory(addr, length);
2275 }
2276
2277 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2278 {
2279     unsigned access_size_max = mr->ops->valid.max_access_size;
2280
2281     /* Regions are assumed to support 1-4 byte accesses unless
2282        otherwise specified.  */
2283     if (access_size_max == 0) {
2284         access_size_max = 4;
2285     }
2286
2287     /* Bound the maximum access by the alignment of the address.  */
2288     if (!mr->ops->impl.unaligned) {
2289         unsigned align_size_max = addr & -addr;
2290         if (align_size_max != 0 && align_size_max < access_size_max) {
2291             access_size_max = align_size_max;
2292         }
2293     }
2294
2295     /* Don't attempt accesses larger than the maximum.  */
2296     if (l > access_size_max) {
2297         l = access_size_max;
2298     }
2299     if (l & (l - 1)) {
2300         l = 1 << (qemu_fls(l) - 1);
2301     }
2302
2303     return l;
2304 }
2305
2306 bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
2307                       int len, bool is_write)
2308 {
2309     hwaddr l;
2310     uint8_t *ptr;
2311     uint64_t val;
2312     hwaddr addr1;
2313     MemoryRegion *mr;
2314     bool error = false;
2315
2316     while (len > 0) {
2317         l = len;
2318         mr = address_space_translate(as, addr, &addr1, &l, is_write);
2319
2320         if (is_write) {
2321             if (!memory_access_is_direct(mr, is_write)) {
2322                 l = memory_access_size(mr, l, addr1);
2323                 /* XXX: could force current_cpu to NULL to avoid
2324                    potential bugs */
2325                 switch (l) {
2326                 case 8:
2327                     /* 64 bit write access */
2328                     val = ldq_p(buf);
2329                     error |= io_mem_write(mr, addr1, val, 8);
2330                     break;
2331                 case 4:
2332                     /* 32 bit write access */
2333                     val = ldl_p(buf);
2334                     error |= io_mem_write(mr, addr1, val, 4);
2335                     break;
2336                 case 2:
2337                     /* 16 bit write access */
2338                     val = lduw_p(buf);
2339                     error |= io_mem_write(mr, addr1, val, 2);
2340                     break;
2341                 case 1:
2342                     /* 8 bit write access */
2343                     val = ldub_p(buf);
2344                     error |= io_mem_write(mr, addr1, val, 1);
2345                     break;
2346                 default:
2347                     abort();
2348                 }
2349             } else {
2350                 addr1 += memory_region_get_ram_addr(mr);
2351                 /* RAM case */
2352                 ptr = qemu_get_ram_ptr(addr1);
2353                 memcpy(ptr, buf, l);
2354                 invalidate_and_set_dirty(addr1, l);
2355             }
2356         } else {
2357             if (!memory_access_is_direct(mr, is_write)) {
2358                 /* I/O case */
2359                 l = memory_access_size(mr, l, addr1);
2360                 switch (l) {
2361                 case 8:
2362                     /* 64 bit read access */
2363                     error |= io_mem_read(mr, addr1, &val, 8);
2364                     stq_p(buf, val);
2365                     break;
2366                 case 4:
2367                     /* 32 bit read access */
2368                     error |= io_mem_read(mr, addr1, &val, 4);
2369                     stl_p(buf, val);
2370                     break;
2371                 case 2:
2372                     /* 16 bit read access */
2373                     error |= io_mem_read(mr, addr1, &val, 2);
2374                     stw_p(buf, val);
2375                     break;
2376                 case 1:
2377                     /* 8 bit read access */
2378                     error |= io_mem_read(mr, addr1, &val, 1);
2379                     stb_p(buf, val);
2380                     break;
2381                 default:
2382                     abort();
2383                 }
2384             } else {
2385                 /* RAM case */
2386                 ptr = qemu_get_ram_ptr(mr->ram_addr + addr1);
2387                 memcpy(buf, ptr, l);
2388             }
2389         }
2390         len -= l;
2391         buf += l;
2392         addr += l;
2393     }
2394
2395     return error;
2396 }
2397
2398 bool address_space_write(AddressSpace *as, hwaddr addr,
2399                          const uint8_t *buf, int len)
2400 {
2401     return address_space_rw(as, addr, (uint8_t *)buf, len, true);
2402 }
2403
2404 bool address_space_read(AddressSpace *as, hwaddr addr, uint8_t *buf, int len)
2405 {
2406     return address_space_rw(as, addr, buf, len, false);
2407 }
2408
2409
2410 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
2411                             int len, int is_write)
2412 {
2413     address_space_rw(&address_space_memory, addr, buf, len, is_write);
2414 }
2415
/* Operation selector for cpu_physical_memory_write_rom_internal(). */
enum write_rom_type {
    WRITE_DATA = 0,     /* copy the buffer into RAM/ROM */
    FLUSH_CACHE = 1,    /* flush the host icache for the range */
};
2420
2421 static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2422     hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
2423 {
2424     hwaddr l;
2425     uint8_t *ptr;
2426     hwaddr addr1;
2427     MemoryRegion *mr;
2428
2429     while (len > 0) {
2430         l = len;
2431         mr = address_space_translate(as, addr, &addr1, &l, true);
2432
2433         if (!(memory_region_is_ram(mr) ||
2434               memory_region_is_romd(mr))) {
2435             /* do nothing */
2436         } else {
2437             addr1 += memory_region_get_ram_addr(mr);
2438             /* ROM/RAM case */
2439             ptr = qemu_get_ram_ptr(addr1);
2440             switch (type) {
2441             case WRITE_DATA:
2442                 memcpy(ptr, buf, l);
2443                 invalidate_and_set_dirty(addr1, l);
2444                 break;
2445             case FLUSH_CACHE:
2446                 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
2447                 break;
2448             }
2449         }
2450         len -= l;
2451         buf += l;
2452         addr += l;
2453     }
2454 }
2455
2456 /* used for ROM loading : can write in RAM and ROM */
2457 void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2458                                    const uint8_t *buf, int len)
2459 {
2460     cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
2461 }
2462
2463 void cpu_flush_icache_range(hwaddr start, int len)
2464 {
2465     /*
2466      * This function should do the same thing as an icache flush that was
2467      * triggered from within the guest. For TCG we are always cache coherent,
2468      * so there is no need to flush anything. For KVM / Xen we need to flush
2469      * the host's instruction cache at least.
2470      */
2471     if (tcg_enabled()) {
2472         return;
2473     }
2474
2475     cpu_physical_memory_write_rom_internal(&address_space_memory,
2476                                            start, NULL, len, FLUSH_CACHE);
2477 }
2478
2479 typedef struct {
2480     MemoryRegion *mr;
2481     void *buffer;
2482     hwaddr addr;
2483     hwaddr len;
2484 } BounceBuffer;
2485
2486 static BounceBuffer bounce;
2487
2488 typedef struct MapClient {
2489     void *opaque;
2490     void (*callback)(void *opaque);
2491     QLIST_ENTRY(MapClient) link;
2492 } MapClient;
2493
2494 static QLIST_HEAD(map_client_list, MapClient) map_client_list
2495     = QLIST_HEAD_INITIALIZER(map_client_list);
2496
2497 void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque))
2498 {
2499     MapClient *client = g_malloc(sizeof(*client));
2500
2501     client->opaque = opaque;
2502     client->callback = callback;
2503     QLIST_INSERT_HEAD(&map_client_list, client, link);
2504     return client;
2505 }
2506
2507 static void cpu_unregister_map_client(void *_client)
2508 {
2509     MapClient *client = (MapClient *)_client;
2510
2511     QLIST_REMOVE(client, link);
2512     g_free(client);
2513 }
2514
2515 static void cpu_notify_map_clients(void)
2516 {
2517     MapClient *client;
2518
2519     while (!QLIST_EMPTY(&map_client_list)) {
2520         client = QLIST_FIRST(&map_client_list);
2521         client->callback(client->opaque);
2522         cpu_unregister_map_client(client);
2523     }
2524 }
2525
2526 bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
2527 {
2528     MemoryRegion *mr;
2529     hwaddr l, xlat;
2530
2531     while (len > 0) {
2532         l = len;
2533         mr = address_space_translate(as, addr, &xlat, &l, is_write);
2534         if (!memory_access_is_direct(mr, is_write)) {
2535             l = memory_access_size(mr, l, addr);
2536             if (!memory_region_access_valid(mr, xlat, l, is_write)) {
2537                 return false;
2538             }
2539         }
2540
2541         len -= l;
2542         addr += l;
2543     }
2544     return true;
2545 }
2546
2547 /* Map a physical memory region into a host virtual address.
2548  * May map a subset of the requested range, given by and returned in *plen.
2549  * May return NULL if resources needed to perform the mapping are exhausted.
2550  * Use only for reads OR writes - not for read-modify-write operations.
2551  * Use cpu_register_map_client() to know when retrying the map operation is
2552  * likely to succeed.
2553  */
2554 void *address_space_map(AddressSpace *as,
2555                         hwaddr addr,
2556                         hwaddr *plen,
2557                         bool is_write)
2558 {
2559     hwaddr len = *plen;
2560     hwaddr done = 0;
2561     hwaddr l, xlat, base;
2562     MemoryRegion *mr, *this_mr;
2563     ram_addr_t raddr;
2564
2565     if (len == 0) {
2566         return NULL;
2567     }
2568
2569     l = len;
2570     mr = address_space_translate(as, addr, &xlat, &l, is_write);
2571     if (!memory_access_is_direct(mr, is_write)) {
2572         if (bounce.buffer) {
2573             return NULL;
2574         }
2575         /* Avoid unbounded allocations */
2576         l = MIN(l, TARGET_PAGE_SIZE);
2577         bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
2578         bounce.addr = addr;
2579         bounce.len = l;
2580
2581         memory_region_ref(mr);
2582         bounce.mr = mr;
2583         if (!is_write) {
2584             address_space_read(as, addr, bounce.buffer, l);
2585         }
2586
2587         *plen = l;
2588         return bounce.buffer;
2589     }
2590
2591     base = xlat;
2592     raddr = memory_region_get_ram_addr(mr);
2593
2594     for (;;) {
2595         len -= l;
2596         addr += l;
2597         done += l;
2598         if (len == 0) {
2599             break;
2600         }
2601
2602         l = len;
2603         this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
2604         if (this_mr != mr || xlat != base + done) {
2605             break;
2606         }
2607     }
2608
2609     memory_region_ref(mr);
2610     *plen = done;
2611     return qemu_ram_ptr_length(raddr + base, plen);
2612 }
2613
2614 /* Unmaps a memory region previously mapped by address_space_map().
2615  * Will also mark the memory as dirty if is_write == 1.  access_len gives
2616  * the amount of memory that was actually read or written by the caller.
2617  */
2618 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
2619                          int is_write, hwaddr access_len)
2620 {
2621     if (buffer != bounce.buffer) {
2622         MemoryRegion *mr;
2623         ram_addr_t addr1;
2624
2625         mr = qemu_ram_addr_from_host(buffer, &addr1);
2626         assert(mr != NULL);
2627         if (is_write) {
2628             invalidate_and_set_dirty(addr1, access_len);
2629         }
2630         if (xen_enabled()) {
2631             xen_invalidate_map_cache_entry(buffer);
2632         }
2633         memory_region_unref(mr);
2634         return;
2635     }
2636     if (is_write) {
2637         address_space_write(as, bounce.addr, bounce.buffer, access_len);
2638     }
2639     qemu_vfree(bounce.buffer);
2640     bounce.buffer = NULL;
2641     memory_region_unref(bounce.mr);
2642     cpu_notify_map_clients();
2643 }
2644
2645 void *cpu_physical_memory_map(hwaddr addr,
2646                               hwaddr *plen,
2647                               int is_write)
2648 {
2649     return address_space_map(&address_space_memory, addr, plen, is_write);
2650 }
2651
2652 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
2653                                int is_write, hwaddr access_len)
2654 {
2655     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
2656 }
2657
2658 /* warning: addr must be aligned */
2659 static inline uint32_t ldl_phys_internal(AddressSpace *as, hwaddr addr,
2660                                          enum device_endian endian)
2661 {
2662     uint8_t *ptr;
2663     uint64_t val;
2664     MemoryRegion *mr;
2665     hwaddr l = 4;
2666     hwaddr addr1;
2667
2668     mr = address_space_translate(as, addr, &addr1, &l, false);
2669     if (l < 4 || !memory_access_is_direct(mr, false)) {
2670         /* I/O case */
2671         io_mem_read(mr, addr1, &val, 4);
2672 #if defined(TARGET_WORDS_BIGENDIAN)
2673         if (endian == DEVICE_LITTLE_ENDIAN) {
2674             val = bswap32(val);
2675         }
2676 #else
2677         if (endian == DEVICE_BIG_ENDIAN) {
2678             val = bswap32(val);
2679         }
2680 #endif
2681     } else {
2682         /* RAM case */
2683         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2684                                 & TARGET_PAGE_MASK)
2685                                + addr1);
2686         switch (endian) {
2687         case DEVICE_LITTLE_ENDIAN:
2688             val = ldl_le_p(ptr);
2689             break;
2690         case DEVICE_BIG_ENDIAN:
2691             val = ldl_be_p(ptr);
2692             break;
2693         default:
2694             val = ldl_p(ptr);
2695             break;
2696         }
2697     }
2698     return val;
2699 }
2700
2701 uint32_t ldl_phys(AddressSpace *as, hwaddr addr)
2702 {
2703     return ldl_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2704 }
2705
2706 uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr)
2707 {
2708     return ldl_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2709 }
2710
2711 uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr)
2712 {
2713     return ldl_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2714 }
2715
2716 /* warning: addr must be aligned */
2717 static inline uint64_t ldq_phys_internal(AddressSpace *as, hwaddr addr,
2718                                          enum device_endian endian)
2719 {
2720     uint8_t *ptr;
2721     uint64_t val;
2722     MemoryRegion *mr;
2723     hwaddr l = 8;
2724     hwaddr addr1;
2725
2726     mr = address_space_translate(as, addr, &addr1, &l,
2727                                  false);
2728     if (l < 8 || !memory_access_is_direct(mr, false)) {
2729         /* I/O case */
2730         io_mem_read(mr, addr1, &val, 8);
2731 #if defined(TARGET_WORDS_BIGENDIAN)
2732         if (endian == DEVICE_LITTLE_ENDIAN) {
2733             val = bswap64(val);
2734         }
2735 #else
2736         if (endian == DEVICE_BIG_ENDIAN) {
2737             val = bswap64(val);
2738         }
2739 #endif
2740     } else {
2741         /* RAM case */
2742         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2743                                 & TARGET_PAGE_MASK)
2744                                + addr1);
2745         switch (endian) {
2746         case DEVICE_LITTLE_ENDIAN:
2747             val = ldq_le_p(ptr);
2748             break;
2749         case DEVICE_BIG_ENDIAN:
2750             val = ldq_be_p(ptr);
2751             break;
2752         default:
2753             val = ldq_p(ptr);
2754             break;
2755         }
2756     }
2757     return val;
2758 }
2759
2760 uint64_t ldq_phys(AddressSpace *as, hwaddr addr)
2761 {
2762     return ldq_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2763 }
2764
2765 uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr)
2766 {
2767     return ldq_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2768 }
2769
2770 uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr)
2771 {
2772     return ldq_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2773 }
2774
2775 /* XXX: optimize */
2776 uint32_t ldub_phys(AddressSpace *as, hwaddr addr)
2777 {
2778     uint8_t val;
2779     address_space_rw(as, addr, &val, 1, 0);
2780     return val;
2781 }
2782
2783 /* warning: addr must be aligned */
2784 static inline uint32_t lduw_phys_internal(AddressSpace *as, hwaddr addr,
2785                                           enum device_endian endian)
2786 {
2787     uint8_t *ptr;
2788     uint64_t val;
2789     MemoryRegion *mr;
2790     hwaddr l = 2;
2791     hwaddr addr1;
2792
2793     mr = address_space_translate(as, addr, &addr1, &l,
2794                                  false);
2795     if (l < 2 || !memory_access_is_direct(mr, false)) {
2796         /* I/O case */
2797         io_mem_read(mr, addr1, &val, 2);
2798 #if defined(TARGET_WORDS_BIGENDIAN)
2799         if (endian == DEVICE_LITTLE_ENDIAN) {
2800             val = bswap16(val);
2801         }
2802 #else
2803         if (endian == DEVICE_BIG_ENDIAN) {
2804             val = bswap16(val);
2805         }
2806 #endif
2807     } else {
2808         /* RAM case */
2809         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2810                                 & TARGET_PAGE_MASK)
2811                                + addr1);
2812         switch (endian) {
2813         case DEVICE_LITTLE_ENDIAN:
2814             val = lduw_le_p(ptr);
2815             break;
2816         case DEVICE_BIG_ENDIAN:
2817             val = lduw_be_p(ptr);
2818             break;
2819         default:
2820             val = lduw_p(ptr);
2821             break;
2822         }
2823     }
2824     return val;
2825 }
2826
2827 uint32_t lduw_phys(AddressSpace *as, hwaddr addr)
2828 {
2829     return lduw_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2830 }
2831
2832 uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr)
2833 {
2834     return lduw_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2835 }
2836
2837 uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr)
2838 {
2839     return lduw_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2840 }
2841
2842 /* warning: addr must be aligned. The ram page is not masked as dirty
2843    and the code inside is not invalidated. It is useful if the dirty
2844    bits are used to track modified PTEs */
2845 void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val)
2846 {
2847     uint8_t *ptr;
2848     MemoryRegion *mr;
2849     hwaddr l = 4;
2850     hwaddr addr1;
2851
2852     mr = address_space_translate(as, addr, &addr1, &l,
2853                                  true);
2854     if (l < 4 || !memory_access_is_direct(mr, true)) {
2855         io_mem_write(mr, addr1, val, 4);
2856     } else {
2857         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2858         ptr = qemu_get_ram_ptr(addr1);
2859         stl_p(ptr, val);
2860
2861         if (unlikely(in_migration)) {
2862             if (cpu_physical_memory_is_clean(addr1)) {
2863                 /* invalidate code */
2864                 tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
2865                 /* set dirty bit */
2866                 cpu_physical_memory_set_dirty_range_nocode(addr1, 4);
2867             }
2868         }
2869     }
2870 }
2871
2872 /* warning: addr must be aligned */
2873 static inline void stl_phys_internal(AddressSpace *as,
2874                                      hwaddr addr, uint32_t val,
2875                                      enum device_endian endian)
2876 {
2877     uint8_t *ptr;
2878     MemoryRegion *mr;
2879     hwaddr l = 4;
2880     hwaddr addr1;
2881
2882     mr = address_space_translate(as, addr, &addr1, &l,
2883                                  true);
2884     if (l < 4 || !memory_access_is_direct(mr, true)) {
2885 #if defined(TARGET_WORDS_BIGENDIAN)
2886         if (endian == DEVICE_LITTLE_ENDIAN) {
2887             val = bswap32(val);
2888         }
2889 #else
2890         if (endian == DEVICE_BIG_ENDIAN) {
2891             val = bswap32(val);
2892         }
2893 #endif
2894         io_mem_write(mr, addr1, val, 4);
2895     } else {
2896         /* RAM case */
2897         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2898         ptr = qemu_get_ram_ptr(addr1);
2899         switch (endian) {
2900         case DEVICE_LITTLE_ENDIAN:
2901             stl_le_p(ptr, val);
2902             break;
2903         case DEVICE_BIG_ENDIAN:
2904             stl_be_p(ptr, val);
2905             break;
2906         default:
2907             stl_p(ptr, val);
2908             break;
2909         }
2910         invalidate_and_set_dirty(addr1, 4);
2911     }
2912 }
2913
2914 void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2915 {
2916     stl_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
2917 }
2918
2919 void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2920 {
2921     stl_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
2922 }
2923
2924 void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2925 {
2926     stl_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
2927 }
2928
2929 /* XXX: optimize */
2930 void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2931 {
2932     uint8_t v = val;
2933     address_space_rw(as, addr, &v, 1, 1);
2934 }
2935
2936 /* warning: addr must be aligned */
2937 static inline void stw_phys_internal(AddressSpace *as,
2938                                      hwaddr addr, uint32_t val,
2939                                      enum device_endian endian)
2940 {
2941     uint8_t *ptr;
2942     MemoryRegion *mr;
2943     hwaddr l = 2;
2944     hwaddr addr1;
2945
2946     mr = address_space_translate(as, addr, &addr1, &l, true);
2947     if (l < 2 || !memory_access_is_direct(mr, true)) {
2948 #if defined(TARGET_WORDS_BIGENDIAN)
2949         if (endian == DEVICE_LITTLE_ENDIAN) {
2950             val = bswap16(val);
2951         }
2952 #else
2953         if (endian == DEVICE_BIG_ENDIAN) {
2954             val = bswap16(val);
2955         }
2956 #endif
2957         io_mem_write(mr, addr1, val, 2);
2958     } else {
2959         /* RAM case */
2960         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2961         ptr = qemu_get_ram_ptr(addr1);
2962         switch (endian) {
2963         case DEVICE_LITTLE_ENDIAN:
2964             stw_le_p(ptr, val);
2965             break;
2966         case DEVICE_BIG_ENDIAN:
2967             stw_be_p(ptr, val);
2968             break;
2969         default:
2970             stw_p(ptr, val);
2971             break;
2972         }
2973         invalidate_and_set_dirty(addr1, 2);
2974     }
2975 }
2976
2977 void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2978 {
2979     stw_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
2980 }
2981
2982 void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2983 {
2984     stw_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
2985 }
2986
2987 void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2988 {
2989     stw_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
2990 }
2991
2992 /* XXX: optimize */
2993 void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val)
2994 {
2995     val = tswap64(val);
2996     address_space_rw(as, addr, (void *) &val, 8, 1);
2997 }
2998
2999 void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3000 {
3001     val = cpu_to_le64(val);
3002     address_space_rw(as, addr, (void *) &val, 8, 1);
3003 }
3004
3005 void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3006 {
3007     val = cpu_to_be64(val);
3008     address_space_rw(as, addr, (void *) &val, 8, 1);
3009 }
3010
3011 /* virtual memory access for debug (includes writing to ROM) */
3012 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3013                         uint8_t *buf, int len, int is_write)
3014 {
3015     int l;
3016     hwaddr phys_addr;
3017     target_ulong page;
3018
3019     while (len > 0) {
3020         page = addr & TARGET_PAGE_MASK;
3021         phys_addr = cpu_get_phys_page_debug(cpu, page);
3022         /* if no physical page mapped, return an error */
3023         if (phys_addr == -1)
3024             return -1;
3025         l = (page + TARGET_PAGE_SIZE) - addr;
3026         if (l > len)
3027             l = len;
3028         phys_addr += (addr & ~TARGET_PAGE_MASK);
3029         if (is_write) {
3030             cpu_physical_memory_write_rom(cpu->as, phys_addr, buf, l);
3031         } else {
3032             address_space_rw(cpu->as, phys_addr, buf, l, 0);
3033         }
3034         len -= l;
3035         buf += l;
3036         addr += l;
3037     }
3038     return 0;
3039 }
3040 #endif
3041
/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 *
 * Returns true when this file is compiled with TARGET_WORDS_BIGENDIAN
 * defined, false otherwise.
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
#if defined(TARGET_WORDS_BIGENDIAN)
    const bool big_endian = true;
#else
    const bool big_endian = false;
#endif

    return big_endian;
}
3055
3056 #ifndef CONFIG_USER_ONLY
3057 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3058 {
3059     MemoryRegion*mr;
3060     hwaddr l = 1;
3061
3062     mr = address_space_translate(&address_space_memory,
3063                                  phys_addr, &phys_addr, &l, false);
3064
3065     return !(memory_region_is_ram(mr) ||
3066              memory_region_is_romd(mr));
3067 }
3068
3069 void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3070 {
3071     RAMBlock *block;
3072
3073     rcu_read_lock();
3074     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3075         func(block->host, block->offset, block->used_length, opaque);
3076     }
3077     rcu_read_unlock();
3078 }
3079 #endif