exec.c

   1 /*
   2  *  Virtual page mapping
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19 #include "config.h"
  20 #ifndef _WIN32
  21 #include <sys/types.h>
  22 #include <sys/mman.h>
  23 #endif
  24
  25 #include "qemu-common.h"
  26 #include "cpu.h"
  27 #include "tcg.h"
  28 #include "hw/hw.h"
  29 #if !defined(CONFIG_USER_ONLY)
  30 #include "hw/boards.h"
  31 #endif
  32 #include "hw/qdev.h"
  33 #include "qemu/osdep.h"
  34 #include "sysemu/kvm.h"
  35 #include "sysemu/sysemu.h"
  36 #include "hw/xen/xen.h"
  37 #include "qemu/timer.h"
  38 #include "qemu/config-file.h"
  39 #include "qemu/error-report.h"
  40 #include "exec/memory.h"
  41 #include "sysemu/dma.h"
  42 #include "exec/address-spaces.h"
  43 #if defined(CONFIG_USER_ONLY)
  44 #include <qemu.h>
  45 #else /* !CONFIG_USER_ONLY */
  46 #include "sysemu/xen-mapcache.h"
  47 #include "trace.h"
  48 #endif
  49 #include "exec/cpu-all.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "exec/cputlb.h"
  52 #include "translate-all.h"
  53
  54 #include "exec/memory-internal.h"
  55 #include "exec/ram_addr.h"
  56
  57 #include "qemu/range.h"
  58
  59 //#define DEBUG_SUBPAGE
  60
  61 #if !defined(CONFIG_USER_ONLY)
/* Dirty-tracking flag; written by cpu_physical_memory_set_dirty_tracking(). */
static bool in_migration;

/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

/* Special-purpose regions.  memory_region_is_unassigned() reports
 * io_mem_rom and io_mem_notdirty as "not unassigned".
 */
MemoryRegion io_mem_rom, io_mem_notdirty;
static MemoryRegion io_mem_unassigned;

/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
#define RAM_PREALLOC   (1 << 0)

/* RAM is mmap-ed with MAP_SHARED */
#define RAM_SHARED     (1 << 1)

/* Only a portion of RAM (used_length) is actually used, and migrated.
 * This used_length size can change across reboots.
 */
#define RAM_RESIZEABLE (1 << 2)
  88
  89 #endif
  90
/* Queue of all CPUs; cpu_exec_init() appends each new CPU to its tail. */
struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
DEFINE_TLS(CPUState *, current_cpu);
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;
  99
 100 #if !defined(CONFIG_USER_ONLY)
 101
typedef struct PhysPageEntry PhysPageEntry;

/* One slot of the physical page map: a 6-bit skip count plus a 26-bit
 * index.  An entry with skip == 0 is a leaf.
 */
struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};
 110
/* "No node" marker: the all-ones value of the 26-bit ptr field
 * (32 bits shifted right by the 6 bits used for skip).
 */
#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)

/* Size of the L2 (and L3, etc) page tables.  */
#define ADDR_SPACE_BITS 64

/* Each node resolves P_L2_BITS bits of the page index and therefore has
 * P_L2_SIZE entries.
 */
#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

/* Number of levels needed to cover the (ADDR_SPACE_BITS - TARGET_PAGE_BITS)
 * page-index bits in chunks of P_L2_BITS, rounded up.
 */
#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
 120
/* A node of the map: a table of P_L2_SIZE entries. */
typedef PhysPageEntry Node[P_L2_SIZE];

/* Backing storage for one physical page map: a growable array of nodes
 * and a growable array of sections, each with a used count (*_nb) and an
 * allocated capacity (*_nb_alloc).
 */
typedef struct PhysPageMap {
    struct rcu_head rcu;

    unsigned sections_nb;        /* sections in use */
    unsigned sections_nb_alloc;  /* capacity of sections[] */
    unsigned nodes_nb;           /* nodes in use */
    unsigned nodes_nb_alloc;     /* capacity of nodes[] */
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;
 133
/* Lookup structure for one AddressSpace: the root entry of the page map,
 * the storage it indexes into, and a back pointer to the address space.
 */
struct AddressSpaceDispatch {
    struct rcu_head rcu;

    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    PhysPageMap map;
    AddressSpace *as;
};
 144
/* Offset of addr within its target page. */
#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
/* A page that is split between several sections.  sub_section[] has one
 * slot per byte offset in the page (indexed with SUBPAGE_IDX()); each slot
 * is used as an index into the dispatch's sections array.
 */
typedef struct subpage_t {
    MemoryRegion iomem;
    AddressSpace *as;
    hwaddr base;
    uint16_t sub_section[TARGET_PAGE_SIZE];
} subpage_t;
 152
/* Fixed, small section numbers.  In this file PHYS_SECTION_UNASSIGNED is
 * used as an index into the sections array, while NOTDIRTY, ROM and WATCH
 * are combined into iotlb values by memory_region_section_get_iotlb().
 */
#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2
#define PHYS_SECTION_WATCH 3
 157
 158 static void io_mem_init(void);
 159 static void memory_map_init(void);
 160 static void tcg_commit(MemoryListener *listener);
 161
 162 static MemoryRegion io_mem_watch;
 163 #endif
 164
 165 #if !defined(CONFIG_USER_ONLY)
 166
 167 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 168 {
 169     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 170         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc * 2, 16);
 171         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 172         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 173     }
 174 }
 175
 176 static uint32_t phys_map_node_alloc(PhysPageMap *map)
 177 {
 178     unsigned i;
 179     uint32_t ret;
 180
 181     ret = map->nodes_nb++;
 182     assert(ret != PHYS_MAP_NODE_NIL);
 183     assert(ret != map->nodes_nb_alloc);
 184     for (i = 0; i < P_L2_SIZE; ++i) {
 185         map->nodes[ret][i].skip = 1;
 186         map->nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
 187     }
 188     return ret;
 189 }
 190
/* Recursively point the pages starting at *index (for *nb pages) at
 * section number @leaf, working inside the subtree whose parent entry
 * is @lp.
 *
 * @map:   storage that nodes are allocated from
 * @lp:    entry whose child node is to be filled in
 * @index: in/out: first page index still to be set; advanced as pages
 *         are consumed
 * @nb:    in/out: number of pages still to be set; decreased as pages
 *         are consumed
 * @leaf:  section number stored in the leaf entries
 * @level: level of the child node; one entry at this level spans
 *         1 << (level * P_L2_BITS) pages
 */
static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                int level)
{
    PhysPageEntry *p;
    int i;
    /* Number of pages covered by a single entry at this level. */
    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);

    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        /* No child node yet: allocate one. */
        lp->ptr = phys_map_node_alloc(map);
        p = map->nodes[lp->ptr];
        if (level == 0) {
            /* Bottom level: entries start out as leaves that refer to the
             * unassigned section.
             */
            for (i = 0; i < P_L2_SIZE; i++) {
                p[i].skip = 0;
                p[i].ptr = PHYS_SECTION_UNASSIGNED;
            }
        }
    } else {
        p = map->nodes[lp->ptr];
    }
    /* Entry within the child node that corresponds to *index. */
    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];

    /* Walk the entries until the range is exhausted or the node ends. */
    while (*nb && lp < &p[P_L2_SIZE]) {
        if ((*index & (step - 1)) == 0 && *nb >= step) {
            /* Aligned and at least one full entry left: make it a leaf. */
            lp->skip = 0;
            lp->ptr = leaf;
            *index += step;
            *nb -= step;
        } else {
            /* Partial coverage: descend; the callee advances *index/*nb. */
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
        }
        ++lp;
    }
}
 225
 226 static void phys_page_set(AddressSpaceDispatch *d,
 227                           hwaddr index, hwaddr nb,
 228                           uint16_t leaf)
 229 {
 230     /* Wildly overreserve - it doesn't matter much. */
 231     phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 232
 233     phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 234 }
 235
/* Compact a non leaf page entry. Simply detect that the entry has a single child,
 * and update our entry so we can skip it and go directly to the destination.
 *
 * @lp:        entry to compact; updated in place when compaction is possible
 * @nodes:     node array that lp->ptr indexes into
 * @compacted: passed down the recursion unchanged; this function neither
 *             reads nor writes it
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes, unsigned long *compacted)
{
    unsigned valid_ptr = P_L2_SIZE;   /* index of the last populated child */
    int valid = 0;                    /* number of populated children */
    PhysPageEntry *p;
    int i;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    p = nodes[lp->ptr];
    for (i = 0; i < P_L2_SIZE; i++) {
        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
            continue;
        }

        valid_ptr = i;
        valid++;
        if (p[i].skip) {
            /* Non-leaf child: compact its subtree first. */
            phys_page_compact(&p[i], nodes, compacted);
        }
    }

    /* We can only compress if there's only one child. */
    if (valid != 1) {
        return;
    }

    assert(valid_ptr < P_L2_SIZE);

    /* Don't compress if it won't fit in the # of bits we have. */
    /* NOTE(review): the skip field is declared 6 bits wide, yet the limit
     * checked here is 1 << 3.  That is safe but possibly stricter than
     * necessary - confirm the intended bound.
     */
    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
        return;
    }

    lp->ptr = p[valid_ptr].ptr;
    if (!p[valid_ptr].skip) {
        /* If our only child is a leaf, make this a leaf. */
        /* By design, we should have made this node a leaf to begin with so we
         * should never reach here.
         * But since it's so simple to handle this, let's do it just in case we
         * change this rule.
         */
        lp->skip = 0;
    } else {
        lp->skip += p[valid_ptr].skip;
    }
}
 288
 289 static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
 290 {
 291     DECLARE_BITMAP(compacted, nodes_nb);
 292
 293     if (d->phys_map.skip) {
 294         phys_page_compact(&d->phys_map, d->map.nodes, compacted);
 295     }
 296 }
 297
 298 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
 299                                            Node *nodes, MemoryRegionSection *sections)
 300 {
 301     PhysPageEntry *p;
 302     hwaddr index = addr >> TARGET_PAGE_BITS;
 303     int i;
 304
 305     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 306         if (lp.ptr == PHYS_MAP_NODE_NIL) {
 307             return &sections[PHYS_SECTION_UNASSIGNED];
 308         }
 309         p = nodes[lp.ptr];
 310         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 311     }
 312
 313     if (sections[lp.ptr].size.hi ||
 314         range_covers_byte(sections[lp.ptr].offset_within_address_space,
 315                           sections[lp.ptr].size.lo, addr)) {
 316         return &sections[lp.ptr];
 317     } else {
 318         return &sections[PHYS_SECTION_UNASSIGNED];
 319     }
 320 }
 321
 322 bool memory_region_is_unassigned(MemoryRegion *mr)
 323 {
 324     return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
 325         && mr != &io_mem_watch;
 326 }
 327
 328 /* Called from RCU critical section */
 329 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 330                                                         hwaddr addr,
 331                                                         bool resolve_subpage)
 332 {
 333     MemoryRegionSection *section;
 334     subpage_t *subpage;
 335
 336     section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
 337     if (resolve_subpage && section->mr->subpage) {
 338         subpage = container_of(section->mr, subpage_t, iomem);
 339         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 340     }
 341     return section;
 342 }
 343
 344 /* Called from RCU critical section */
 345 static MemoryRegionSection *
 346 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 347                                  hwaddr *plen, bool resolve_subpage)
 348 {
 349     MemoryRegionSection *section;
 350     Int128 diff;
 351
 352     section = address_space_lookup_region(d, addr, resolve_subpage);
 353     /* Compute offset within MemoryRegionSection */
 354     addr -= section->offset_within_address_space;
 355
 356     /* Compute offset within MemoryRegion */
 357     *xlat = addr + section->offset_within_region;
 358
 359     diff = int128_sub(section->mr->size, int128_make64(addr));
 360     *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 361     return section;
 362 }
 363
 364 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 365 {
 366     if (memory_region_is_ram(mr)) {
 367         return !(is_write && mr->readonly);
 368     }
 369     if (memory_region_is_romd(mr)) {
 370         return !is_write;
 371     }
 372
 373     return false;
 374 }
 375
 376 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
 377                                       hwaddr *xlat, hwaddr *plen,
 378                                       bool is_write)
 379 {
 380     IOMMUTLBEntry iotlb;
 381     MemoryRegionSection *section;
 382     MemoryRegion *mr;
 383     hwaddr len = *plen;
 384
 385     rcu_read_lock();
 386     for (;;) {
 387         AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
 388         section = address_space_translate_internal(d, addr, &addr, plen, true);
 389         mr = section->mr;
 390
 391         if (!mr->iommu_ops) {
 392             break;
 393         }
 394
 395         iotlb = mr->iommu_ops->translate(mr, addr, is_write);
 396         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 397                 | (addr & iotlb.addr_mask));
 398         len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
 399         if (!(iotlb.perm & (1 << is_write))) {
 400             mr = &io_mem_unassigned;
 401             break;
 402         }
 403
 404         as = iotlb.target_as;
 405     }
 406
 407     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 408         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 409         len = MIN(page, len);
 410     }
 411
 412     *plen = len;
 413     *xlat = addr;
 414     rcu_read_unlock();
 415     return mr;
 416 }
 417
 418 /* Called from RCU critical section */
 419 MemoryRegionSection *
 420 address_space_translate_for_iotlb(CPUState *cpu, hwaddr addr,
 421                                   hwaddr *xlat, hwaddr *plen)
 422 {
 423     MemoryRegionSection *section;
 424     section = address_space_translate_internal(cpu->memory_dispatch,
 425                                                addr, xlat, plen, false);
 426
 427     assert(!section->mr->iommu_ops);
 428     return section;
 429 }
 430 #endif
 431
/* Global initialisation.  In system emulation builds this initialises the
 * ram_list mutex, then the memory map, then the I/O memory regions, in
 * that order.  It does nothing when CONFIG_USER_ONLY is defined.
 */
void cpu_exec_init_all(void)
{
#if !defined(CONFIG_USER_ONLY)
    qemu_mutex_init(&ram_list.mutex);
    memory_map_init();
    io_mem_init();
#endif
}
 440
 441 #if !defined(CONFIG_USER_ONLY)
 442
 443 static int cpu_common_post_load(void *opaque, int version_id)
 444 {
 445     CPUState *cpu = opaque;
 446
 447     /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 448        version_id is increased. */
 449     cpu->interrupt_request &= ~0x01;
 450     tlb_flush(cpu, 1);
 451
 452     return 0;
 453 }
 454
 455 static int cpu_common_pre_load(void *opaque)
 456 {
 457     CPUState *cpu = opaque;
 458
 459     cpu->exception_index = -1;
 460
 461     return 0;
 462 }
 463
 464 static bool cpu_common_exception_index_needed(void *opaque)
 465 {
 466     CPUState *cpu = opaque;
 467
 468     return tcg_enabled() && cpu->exception_index != -1;
 469 }
 470
/* Subsection description carrying CPUState.exception_index as an int32.
 * It is referenced from vmstate_cpu_common.subsections.
 */
static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};
 480
/* State description for the target-independent part of a CPU: the
 * halted and interrupt_request fields, plus an exception_index
 * subsection guarded by cpu_common_exception_index_needed().
 * cpu_exec_init() registers it for CPUs whose device has no vmsd.
 */
const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = cpu_common_pre_load,
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (VMStateSubsection[]) {
        {
            .vmsd = &vmstate_cpu_common_exception_index,
            .needed = cpu_common_exception_index_needed,
        } , {
            /* empty */
        }
    }
};
 501
 502 #endif
 503
 504 CPUState *qemu_get_cpu(int index)
 505 {
 506     CPUState *cpu;
 507
 508     CPU_FOREACH(cpu) {
 509         if (cpu->cpu_index == index) {
 510             return cpu;
 511         }
 512     }
 513
 514     return NULL;
 515 }
 516
 517 #if !defined(CONFIG_USER_ONLY)
 518 void tcg_cpu_address_space_init(CPUState *cpu, AddressSpace *as)
 519 {
 520     /* We only support one address space per cpu at the moment.  */
 521     assert(cpu->as == as);
 522
 523     if (cpu->tcg_as_listener) {
 524         memory_listener_unregister(cpu->tcg_as_listener);
 525     } else {
 526         cpu->tcg_as_listener = g_new0(MemoryListener, 1);
 527     }
 528     cpu->tcg_as_listener->commit = tcg_commit;
 529     memory_listener_register(cpu->tcg_as_listener, as);
 530 }
 531 #endif
 532
/* Set up the common part of a new CPU and add it to the global cpus list.
 *
 * @env: architecture state of the CPU; the CPUState is derived from it
 *       with ENV_GET_CPU().
 *
 * The CPU gets the next free index (the number of CPUs already listed),
 * empty breakpoint and watchpoint queues and, in system emulation, the
 * system memory address space.  Migration state is then registered.
 */
void cpu_exec_init(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);
    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUState *some_cpu;
    int cpu_index;

#if defined(CONFIG_USER_ONLY)
    /* User-mode builds take the CPU list lock around the list update. */
    cpu_list_lock();
#endif
    /* Count the CPUs already present; that count becomes our index. */
    cpu_index = 0;
    CPU_FOREACH(some_cpu) {
        cpu_index++;
    }
    cpu->cpu_index = cpu_index;
    cpu->numa_node = 0;
    QTAILQ_INIT(&cpu->breakpoints);
    QTAILQ_INIT(&cpu->watchpoints);
#ifndef CONFIG_USER_ONLY
    cpu->as = &address_space_memory;
    cpu->thread_id = qemu_get_thread_id();
    cpu_reload_memory_map(cpu);
#endif
    QTAILQ_INSERT_TAIL(&cpus, cpu, node);
#if defined(CONFIG_USER_ONLY)
    cpu_list_unlock();
#endif
    /* Register the common state only when the device has no vmsd. */
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
    }
#if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY)
    /* Targets defining CPU_SAVE_VERSION register cpu_save/cpu_load and
     * must not also provide a vmsd; both are asserted below.
     */
    register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION,
                    cpu_save, cpu_load, env);
    assert(cc->vmsd == NULL);
    assert(qdev_get_vmsd(DEVICE(cpu)) == NULL);
#endif
    if (cc->vmsd != NULL) {
        vmstate_register(NULL, cpu_index, cc->vmsd, cpu);
    }
}
 573
 574 #if defined(CONFIG_USER_ONLY)
/* User-mode variant: invalidate the translated code covering the single
 * byte at @pc.  @cpu is unused here.
 */
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    tb_invalidate_phys_page_range(pc, pc + 1, 0);
}
 579 #else
 580 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 581 {
 582     hwaddr phys = cpu_get_phys_page_debug(cpu, pc);
 583     if (phys != -1) {
 584         tb_invalidate_phys_addr(cpu->as,
 585                                 phys | (pc & ~TARGET_PAGE_MASK));
 586     }
 587 }
 588 #endif
 589
 590 #if defined(CONFIG_USER_ONLY)
/* User-mode stubs: in CONFIG_USER_ONLY builds the watchpoint functions
 * below do nothing; the insert/remove variants report -ENOSYS.
 */

/* Stub: does nothing. */
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)

{
}

/* Stub: always returns -ENOSYS. */
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    return -ENOSYS;
}

/* Stub: does nothing. */
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
}

/* Stub: always returns -ENOSYS; *watchpoint is left untouched. */
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    return -ENOSYS;
}
 611 #else
 612 /* Add a watchpoint.  */
 613 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 614                           int flags, CPUWatchpoint **watchpoint)
 615 {
 616     CPUWatchpoint *wp;
 617
 618     /* forbid ranges which are empty or run off the end of the address space */
 619     if (len == 0 || (addr + len - 1) < addr) {
 620         error_report("tried to set invalid watchpoint at %"
 621                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
 622         return -EINVAL;
 623     }
 624     wp = g_malloc(sizeof(*wp));
 625
 626     wp->vaddr = addr;
 627     wp->len = len;
 628     wp->flags = flags;
 629
 630     /* keep all GDB-injected watchpoints in front */
 631     if (flags & BP_GDB) {
 632         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
 633     } else {
 634         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
 635     }
 636
 637     tlb_flush_page(cpu, addr);
 638
 639     if (watchpoint)
 640         *watchpoint = wp;
 641     return 0;
 642 }
 643
 644 /* Remove a specific watchpoint.  */
 645 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 646                           int flags)
 647 {
 648     CPUWatchpoint *wp;
 649
 650     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 651         if (addr == wp->vaddr && len == wp->len
 652                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 653             cpu_watchpoint_remove_by_ref(cpu, wp);
 654             return 0;
 655         }
 656     }
 657     return -ENOENT;
 658 }
 659
 660 /* Remove a specific watchpoint by reference.  */
 661 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 662 {
 663     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 664
 665     tlb_flush_page(cpu, watchpoint->vaddr);
 666
 667     g_free(watchpoint);
 668 }
 669
 670 /* Remove all matching watchpoints.  */
 671 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 672 {
 673     CPUWatchpoint *wp, *next;
 674
 675     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
 676         if (wp->flags & mask) {
 677             cpu_watchpoint_remove_by_ref(cpu, wp);
 678         }
 679     }
 680 }
 681
 682 /* Return true if this watchpoint address matches the specified
 683  * access (ie the address range covered by the watchpoint overlaps
 684  * partially or completely with the address range covered by the
 685  * access).
 686  */
 687 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
 688                                                   vaddr addr,
 689                                                   vaddr len)
 690 {
 691     /* We know the lengths are non-zero, but a little caution is
 692      * required to avoid errors in the case where the range ends
 693      * exactly at the top of the address space and so addr + len
 694      * wraps round to zero.
 695      */
 696     vaddr wpend = wp->vaddr + wp->len - 1;
 697     vaddr addrend = addr + len - 1;
 698
 699     return !(addr > wpend || wp->vaddr > addrend);
 700 }
 701
 702 #endif
 703
 704 /* Add a breakpoint.  */
 705 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
 706                           CPUBreakpoint **breakpoint)
 707 {
 708     CPUBreakpoint *bp;
 709
 710     bp = g_malloc(sizeof(*bp));
 711
 712     bp->pc = pc;
 713     bp->flags = flags;
 714
 715     /* keep all GDB-injected breakpoints in front */
 716     if (flags & BP_GDB) {
 717         QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
 718     } else {
 719         QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
 720     }
 721
 722     breakpoint_invalidate(cpu, pc);
 723
 724     if (breakpoint) {
 725         *breakpoint = bp;
 726     }
 727     return 0;
 728 }
 729
 730 /* Remove a specific breakpoint.  */
 731 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
 732 {
 733     CPUBreakpoint *bp;
 734
 735     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
 736         if (bp->pc == pc && bp->flags == flags) {
 737             cpu_breakpoint_remove_by_ref(cpu, bp);
 738             return 0;
 739         }
 740     }
 741     return -ENOENT;
 742 }
 743
 744 /* Remove a specific breakpoint by reference.  */
 745 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
 746 {
 747     QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
 748
 749     breakpoint_invalidate(cpu, breakpoint->pc);
 750
 751     g_free(breakpoint);
 752 }
 753
 754 /* Remove all matching breakpoints. */
 755 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
 756 {
 757     CPUBreakpoint *bp, *next;
 758
 759     QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
 760         if (bp->flags & mask) {
 761             cpu_breakpoint_remove_by_ref(cpu, bp);
 762         }
 763     }
 764 }
 765
 766 /* enable or disable single step mode. EXCP_DEBUG is returned by the
 767    CPU loop after each instruction */
 768 void cpu_single_step(CPUState *cpu, int enabled)
 769 {
 770     if (cpu->singlestep_enabled != enabled) {
 771         cpu->singlestep_enabled = enabled;
 772         if (kvm_enabled()) {
 773             kvm_update_guest_debug(cpu, 0);
 774         } else {
 775             /* must flush all the translated code to avoid inconsistencies */
 776             /* XXX: only flush what is necessary */
 777             CPUArchState *env = cpu->env_ptr;
 778             tb_flush(env);
 779         }
 780     }
 781 }
 782
 783 void cpu_abort(CPUState *cpu, const char *fmt, ...)
 784 {
 785     va_list ap;
 786     va_list ap2;
 787
 788     va_start(ap, fmt);
 789     va_copy(ap2, ap);
 790     fprintf(stderr, "qemu: fatal: ");
 791     vfprintf(stderr, fmt, ap);
 792     fprintf(stderr, "\n");
 793     cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 794     if (qemu_log_enabled()) {
 795         qemu_log("qemu: fatal: ");
 796         qemu_log_vprintf(fmt, ap2);
 797         qemu_log("\n");
 798         log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 799         qemu_log_flush();
 800         qemu_log_close();
 801     }
 802     va_end(ap2);
 803     va_end(ap);
 804 #if defined(CONFIG_USER_ONLY)
 805     {
 806         struct sigaction act;
 807         sigfillset(&act.sa_mask);
 808         act.sa_handler = SIG_DFL;
 809         sigaction(SIGABRT, &act, NULL);
 810     }
 811 #endif
 812     abort();
 813 }
 814
 815 #if !defined(CONFIG_USER_ONLY)
/* Called from RCU critical section.
 *
 * Return the RAM block whose [offset, offset + max_length) range contains
 * @addr.  The most recently used block is tried first, then the whole
 * list is scanned.  If no block matches, an error is printed and the
 * process aborts.  The block found is remembered as the new MRU block.
 */
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
    RAMBlock *block;

    block = atomic_rcu_read(&ram_list.mru_block);
    /* A single comparison covers both bounds; presumably this relies on
     * ram_addr_t being unsigned so that addr < offset wraps to a large
     * value - TODO confirm.
     */
    if (block && addr - block->offset < block->max_length) {
        goto found;
    }
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (addr - block->offset < block->max_length) {
            goto found;
        }
    }

    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
    abort();

found:
    /* It is safe to write mru_block outside the iothread lock.  This
     * is what happens:
     *
     *     mru_block = xxx
     *     rcu_read_unlock()
     *                                        xxx removed from list
     *                  rcu_read_lock()
     *                  read mru_block
     *                                        mru_block = NULL;
     *                                        call_rcu(reclaim_ramblock, xxx);
     *                  rcu_read_unlock()
     *
     * atomic_rcu_set is not needed here.  The block was already published
     * when it was placed into the list.  Here we're just making an extra
     * copy of the pointer.
     */
    ram_list.mru_block = block;
    return block;
}
 854
 855 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
 856 {
 857     ram_addr_t start1;
 858     RAMBlock *block;
 859     ram_addr_t end;
 860
 861     end = TARGET_PAGE_ALIGN(start + length);
 862     start &= TARGET_PAGE_MASK;
 863
 864     rcu_read_lock();
 865     block = qemu_get_ram_block(start);
 866     assert(block == qemu_get_ram_block(end - 1));
 867     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
 868     cpu_tlb_reset_dirty_all(start1, length);
 869     rcu_read_unlock();
 870 }
 871
 872 /* Note: start and end must be within the same ram block.  */
 873 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t length,
 874                                      unsigned client)
 875 {
 876     if (length == 0)
 877         return;
 878     cpu_physical_memory_clear_dirty_range_type(start, length, client);
 879
 880     if (tcg_enabled()) {
 881         tlb_reset_dirty_range_all(start, length);
 882     }
 883 }
 884
/* Record in the file-level in_migration flag whether dirty tracking is
 * being enabled (@enable true) or disabled.
 */
static void cpu_physical_memory_set_dirty_tracking(bool enable)
{
    in_migration = enable;
}
 889
 890 /* Called from RCU critical section */
 891 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
 892                                        MemoryRegionSection *section,
 893                                        target_ulong vaddr,
 894                                        hwaddr paddr, hwaddr xlat,
 895                                        int prot,
 896                                        target_ulong *address)
 897 {
 898     hwaddr iotlb;
 899     CPUWatchpoint *wp;
 900
 901     if (memory_region_is_ram(section->mr)) {
 902         /* Normal RAM.  */
 903         iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
 904             + xlat;
 905         if (!section->readonly) {
 906             iotlb |= PHYS_SECTION_NOTDIRTY;
 907         } else {
 908             iotlb |= PHYS_SECTION_ROM;
 909         }
 910     } else {
 911         iotlb = section - section->address_space->dispatch->map.sections;
 912         iotlb += xlat;
 913     }
 914
 915     /* Make accesses to pages with watchpoints go via the
 916        watchpoint trap routines.  */
 917     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 918         if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
 919             /* Avoid trapping reads of pages with a write breakpoint. */
 920             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
 921                 iotlb = PHYS_SECTION_WATCH + paddr;
 922                 *address |= TLB_MMIO;
 923                 break;
 924             }
 925         }
 926     }
 927
 928     return iotlb;
 929 }
#endif /* !defined(CONFIG_USER_ONLY) */
 931
 932 #if !defined(CONFIG_USER_ONLY)
 933
 934 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
 935                              uint16_t section);
 936 static subpage_t *subpage_init(AddressSpace *as, hwaddr base);
 937
/* Allocator used for guest RAM; defaults to qemu_anon_ram_alloc and can be
 * replaced with phys_mem_set_alloc().
 */
static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
                               qemu_anon_ram_alloc;

/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
{
    phys_mem_alloc = alloc;
}
 950
 951 static uint16_t phys_section_add(PhysPageMap *map,
 952                                  MemoryRegionSection *section)
 953 {
 954     /* The physical section number is ORed with a page-aligned
 955      * pointer to produce the iotlb entries.  Thus it should
 956      * never overflow into the page-aligned value.
 957      */
 958     assert(map->sections_nb < TARGET_PAGE_SIZE);
 959
 960     if (map->sections_nb == map->sections_nb_alloc) {
 961         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
 962         map->sections = g_renew(MemoryRegionSection, map->sections,
 963                                 map->sections_nb_alloc);
 964     }
 965     map->sections[map->sections_nb] = *section;
 966     memory_region_ref(section->mr);
 967     return map->sections_nb++;
 968 }
 969
 970 static void phys_section_destroy(MemoryRegion *mr)
 971 {
 972     memory_region_unref(mr);
 973
 974     if (mr->subpage) {
 975         subpage_t *subpage = container_of(mr, subpage_t, iomem);
 976         object_unref(OBJECT(&subpage->iomem));
 977         g_free(subpage);
 978     }
 979 }
 980
 981 static void phys_sections_free(PhysPageMap *map)
 982 {
 983     while (map->sections_nb > 0) {
 984         MemoryRegionSection *section = &map->sections[--map->sections_nb];
 985         phys_section_destroy(section->mr);
 986     }
 987     g_free(map->sections);
 988     g_free(map->nodes);
 989 }
 990
 991 static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
 992 {
 993     subpage_t *subpage;
 994     hwaddr base = section->offset_within_address_space
 995         & TARGET_PAGE_MASK;
 996     MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
 997                                                    d->map.nodes, d->map.sections);
 998     MemoryRegionSection subsection = {
 999         .offset_within_address_space = base,
1000         .size = int128_make64(TARGET_PAGE_SIZE),
1001     };
1002     hwaddr start, end;
1003
1004     assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1005
1006     if (!(existing->mr->subpage)) {
1007         subpage = subpage_init(d->as, base);
1008         subsection.address_space = d->as;
1009         subsection.mr = &subpage->iomem;
1010         phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1011                       phys_section_add(&d->map, &subsection));
1012     } else {
1013         subpage = container_of(existing->mr, subpage_t, iomem);
1014     }
1015     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1016     end = start + int128_get64(section->size) - 1;
1017     subpage_register(subpage, start, end,
1018                      phys_section_add(&d->map, section));
1019 }
1020
1021
1022 static void register_multipage(AddressSpaceDispatch *d,
1023                                MemoryRegionSection *section)
1024 {
1025     hwaddr start_addr = section->offset_within_address_space;
1026     uint16_t section_index = phys_section_add(&d->map, section);
1027     uint64_t num_pages = int128_get64(int128_rshift(section->size,
1028                                                     TARGET_PAGE_BITS));
1029
1030     assert(num_pages);
1031     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1032 }
1033
/*
 * Memory listener callback: enter @section into the dispatch map that is
 * being built for the listener's address space (as->next_dispatch).
 *
 * The section is consumed piece by piece.  Pieces that start off a page
 * boundary or are shorter than a page go through register_subpage(); a
 * page-aligned piece of at least one page is rounded down to whole pages
 * and goes through register_multipage().
 */
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = as->next_dispatch;
    /* "now" is the piece being registered, "remain" what is still left. */
    MemoryRegionSection now = *section, remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        /* Unaligned start: register at most up to the next page boundary. */
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(d, &now);
    } else {
        /* Aligned start: nothing has been consumed yet. */
        now.size = int128_zero();
    }
    /* Each pass removes the piece just registered from "remain"; the loop
     * ends once that piece was everything that remained.
     */
    while (int128_ne(remain.size, now.size)) {
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
        now = remain;
        if (int128_lt(remain.size, page_size)) {
            /* Less than one page left: register it as a partial page. */
            register_subpage(d, &now);
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
            /* Start is not page aligned: hand over one page-sized piece. */
            now.size = page_size;
            register_subpage(d, &now);
        } else {
            /* Aligned: round the size down to a whole number of pages. */
            now.size = int128_and(now.size, int128_neg(page_size));
            register_multipage(d, &now);
        }
    }
}
1066
/* Flush the coalesced MMIO buffer; this only does something under KVM. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }
    kvm_flush_coalesced_mmio_buffer();
}
1072
/* Acquire ram_list.mutex; pair with qemu_mutex_unlock_ramlist(). */
void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}
1077
/* Release ram_list.mutex taken with qemu_mutex_lock_ramlist(). */
void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}
1082
1083 #ifdef __linux__
1084
1085 #include <sys/vfs.h>
1086
1087 #define HUGETLBFS_MAGIC       0x958458f6
1088
1089 static long gethugepagesize(const char *path, Error **errp)
1090 {
1091     struct statfs fs;
1092     int ret;
1093
1094     do {
1095         ret = statfs(path, &fs);
1096     } while (ret != 0 && errno == EINTR);
1097
1098     if (ret != 0) {
1099         error_setg_errno(errp, errno, "failed to get page size of file %s",
1100                          path);
1101         return 0;
1102     }
1103
1104     if (fs.f_type != HUGETLBFS_MAGIC)
1105         fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
1106
1107     return fs.f_bsize;
1108 }
1109
1110 static void *file_ram_alloc(RAMBlock *block,
1111                             ram_addr_t memory,
1112                             const char *path,
1113                             Error **errp)
1114 {
1115     char *filename;
1116     char *sanitized_name;
1117     char *c;
1118     void *area = NULL;
1119     int fd;
1120     uint64_t hpagesize;
1121     Error *local_err = NULL;
1122
1123     hpagesize = gethugepagesize(path, &local_err);
1124     if (local_err) {
1125         error_propagate(errp, local_err);
1126         goto error;
1127     }
1128     block->mr->align = hpagesize;
1129
1130     if (memory < hpagesize) {
1131         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1132                    "or larger than huge page size 0x%" PRIx64,
1133                    memory, hpagesize);
1134         goto error;
1135     }
1136
1137     if (kvm_enabled() && !kvm_has_sync_mmu()) {
1138         error_setg(errp,
1139                    "host lacks kvm mmu notifiers, -mem-path unsupported");
1140         goto error;
1141     }
1142
1143     /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1144     sanitized_name = g_strdup(memory_region_name(block->mr));
1145     for (c = sanitized_name; *c != '\0'; c++) {
1146         if (*c == '/')
1147             *c = '_';
1148     }
1149
1150     filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1151                                sanitized_name);
1152     g_free(sanitized_name);
1153
1154     fd = mkstemp(filename);
1155     if (fd < 0) {
1156         error_setg_errno(errp, errno,
1157                          "unable to create backing store for hugepages");
1158         g_free(filename);
1159         goto error;
1160     }
1161     unlink(filename);
1162     g_free(filename);
1163
1164     memory = (memory+hpagesize-1) & ~(hpagesize-1);
1165
1166     /*
1167      * ftruncate is not supported by hugetlbfs in older
1168      * hosts, so don't bother bailing out on errors.
1169      * If anything goes wrong with it under other filesystems,
1170      * mmap will fail.
1171      */
1172     if (ftruncate(fd, memory)) {
1173         perror("ftruncate");
1174     }
1175
1176     area = mmap(0, memory, PROT_READ | PROT_WRITE,
1177                 (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE),
1178                 fd, 0);
1179     if (area == MAP_FAILED) {
1180         error_setg_errno(errp, errno,
1181                          "unable to map backing store for hugepages");
1182         close(fd);
1183         goto error;
1184     }
1185
1186     if (mem_prealloc) {
1187         os_mem_prealloc(fd, area, memory);
1188     }
1189
1190     block->fd = fd;
1191     return area;
1192
1193 error:
1194     if (mem_prealloc) {
1195         error_report("%s", error_get_pretty(*errp));
1196         exit(1);
1197     }
1198     return NULL;
1199 }
1200 #endif
1201
1202 /* Called with the ramlist lock held.  */
1203 static ram_addr_t find_ram_offset(ram_addr_t size)
1204 {
1205     RAMBlock *block, *next_block;
1206     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1207
1208     assert(size != 0); /* it would hand out same offset multiple times */
1209
1210     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1211         return 0;
1212     }
1213
1214     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1215         ram_addr_t end, next = RAM_ADDR_MAX;
1216
1217         end = block->offset + block->max_length;
1218
1219         QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
1220             if (next_block->offset >= end) {
1221                 next = MIN(next, next_block->offset);
1222             }
1223         }
1224         if (next - end >= size && next - end < mingap) {
1225             offset = end;
1226             mingap = next - end;
1227         }
1228     }
1229
1230     if (offset == RAM_ADDR_MAX) {
1231         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1232                 (uint64_t)size);
1233         abort();
1234     }
1235
1236     return offset;
1237 }
1238
1239 ram_addr_t last_ram_offset(void)
1240 {
1241     RAMBlock *block;
1242     ram_addr_t last = 0;
1243
1244     rcu_read_lock();
1245     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1246         last = MAX(last, block->offset + block->max_length);
1247     }
1248     rcu_read_unlock();
1249     return last;
1250 }
1251
1252 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1253 {
1254     int ret;
1255
1256     /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1257     if (!machine_dump_guest_core(current_machine)) {
1258         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1259         if (ret) {
1260             perror("qemu_madvise");
1261             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1262                             "but dump_guest_core=off specified\n");
1263         }
1264     }
1265 }
1266
1267 /* Called within an RCU critical section, or while the ramlist lock
1268  * is held.
1269  */
1270 static RAMBlock *find_ram_block(ram_addr_t addr)
1271 {
1272     RAMBlock *block;
1273
1274     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1275         if (block->offset == addr) {
1276             return block;
1277         }
1278     }
1279
1280     return NULL;
1281 }
1282
1283 /* Called with iothread lock held.  */
1284 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
1285 {
1286     RAMBlock *new_block, *block;
1287
1288     rcu_read_lock();
1289     new_block = find_ram_block(addr);
1290     assert(new_block);
1291     assert(!new_block->idstr[0]);
1292
1293     if (dev) {
1294         char *id = qdev_get_dev_path(dev);
1295         if (id) {
1296             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1297             g_free(id);
1298         }
1299     }
1300     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1301
1302     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1303         if (block != new_block && !strcmp(block->idstr, new_block->idstr)) {
1304             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1305                     new_block->idstr);
1306             abort();
1307         }
1308     }
1309     rcu_read_unlock();
1310 }
1311
1312 /* Called with iothread lock held.  */
1313 void qemu_ram_unset_idstr(ram_addr_t addr)
1314 {
1315     RAMBlock *block;
1316
1317     /* FIXME: arch_init.c assumes that this is not called throughout
1318      * migration.  Ignore the problem since hot-unplug during migration
1319      * does not work anyway.
1320      */
1321
1322     rcu_read_lock();
1323     block = find_ram_block(addr);
1324     if (block) {
1325         memset(block->idstr, 0, sizeof(block->idstr));
1326     }
1327     rcu_read_unlock();
1328 }
1329
1330 static int memory_try_enable_merging(void *addr, size_t len)
1331 {
1332     if (!machine_mem_merge(current_machine)) {
1333         /* disabled by the user */
1334         return 0;
1335     }
1336
1337     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1338 }
1339
/* Only legal before guest might have detected the memory size: e.g. on
 * incoming migration, or right after reset.
 *
 * As memory core doesn't know how is memory accessed, it is up to
 * resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 *
 * @base:    offset of the RAM block to resize; a block with exactly this
 *           offset must exist.
 * @newsize: requested used length; rounded up to a target page multiple.
 * @errp:    set when the block is not resizeable or @newsize exceeds the
 *           block's max_length.
 *
 * Returns 0 on success (including when the size is already @newsize) and
 * -EINVAL on error.
 */
int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
{
    RAMBlock *block = find_ram_block(base);

    assert(block);

    newsize = TARGET_PAGE_ALIGN(newsize);

    /* Unchanged size is accepted even for blocks that cannot be resized. */
    if (block->used_length == newsize) {
        return 0;
    }

    if (!(block->flags & RAM_RESIZEABLE)) {
        error_setg_errno(errp, EINVAL,
                         "Length mismatch: %s: 0x" RAM_ADDR_FMT
                         " in != 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->used_length);
        return -EINVAL;
    }

    if (block->max_length < newsize) {
        error_setg_errno(errp, EINVAL,
                         "Length too large: %s: 0x" RAM_ADDR_FMT
                         " > 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->max_length);
        return -EINVAL;
    }

    /* Clear the dirty state of the old range, then mark the whole new
     * range dirty once the used length has been updated.
     */
    cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
    block->used_length = newsize;
    cpu_physical_memory_set_dirty_range(block->offset, block->used_length);
    memory_region_set_size(block->mr, newsize);
    /* Let the owner of the block know about the new size, if it asked. */
    if (block->resized) {
        block->resized(block->idstr, newsize, block->host);
    }
    return 0;
}
1384
/*
 * Insert @new_block into ram_list: choose its offset, allocate host
 * memory if the block does not come with any, link it into the list,
 * grow the dirty bitmaps if the RAM address space grew and mark the
 * block's used range dirty.
 *
 * Returns the offset assigned to the block.  If host memory cannot be
 * allocated, @errp is set and -1 is returned; the caller still owns
 * @new_block in that case.
 */
static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;

    /* Size of the RAM address space, in pages, before the insertion. */
    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr);
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return -1;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    if (block) {
        /* Stopped at the first block smaller than the new one. */
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        /* No smaller block exists: append after the last element. */
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    if (new_ram_size > old_ram_size) {
        int i;

        /* ram_list.dirty_memory[] is protected by the iothread lock.  */
        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
            ram_list.dirty_memory[i] =
                bitmap_zero_extend(ram_list.dirty_memory[i],
                                   old_ram_size, new_ram_size);
       }
    }
    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length);

    /* Host-side advice only applies when the block has host memory. */
    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        if (kvm_enabled()) {
            kvm_setup_guest_memory(new_block->host, new_block->max_length);
        }
    }

    return new_block->offset;
}
1464
1465 #ifdef __linux__
1466 ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
1467                                     bool share, const char *mem_path,
1468                                     Error **errp)
1469 {
1470     RAMBlock *new_block;
1471     ram_addr_t addr;
1472     Error *local_err = NULL;
1473
1474     if (xen_enabled()) {
1475         error_setg(errp, "-mem-path not supported with Xen");
1476         return -1;
1477     }
1478
1479     if (phys_mem_alloc != qemu_anon_ram_alloc) {
1480         /*
1481          * file_ram_alloc() needs to allocate just like
1482          * phys_mem_alloc, but we haven't bothered to provide
1483          * a hook there.
1484          */
1485         error_setg(errp,
1486                    "-mem-path not supported with this accelerator");
1487         return -1;
1488     }
1489
1490     size = TARGET_PAGE_ALIGN(size);
1491     new_block = g_malloc0(sizeof(*new_block));
1492     new_block->mr = mr;
1493     new_block->used_length = size;
1494     new_block->max_length = size;
1495     new_block->flags = share ? RAM_SHARED : 0;
1496     new_block->host = file_ram_alloc(new_block, size,
1497                                      mem_path, errp);
1498     if (!new_block->host) {
1499         g_free(new_block);
1500         return -1;
1501     }
1502
1503     addr = ram_block_add(new_block, &local_err);
1504     if (local_err) {
1505         g_free(new_block);
1506         error_propagate(errp, local_err);
1507         return -1;
1508     }
1509     return addr;
1510 }
1511 #endif
1512
1513 static
1514 ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
1515                                    void (*resized)(const char*,
1516                                                    uint64_t length,
1517                                                    void *host),
1518                                    void *host, bool resizeable,
1519                                    MemoryRegion *mr, Error **errp)
1520 {
1521     RAMBlock *new_block;
1522     ram_addr_t addr;
1523     Error *local_err = NULL;
1524
1525     size = TARGET_PAGE_ALIGN(size);
1526     max_size = TARGET_PAGE_ALIGN(max_size);
1527     new_block = g_malloc0(sizeof(*new_block));
1528     new_block->mr = mr;
1529     new_block->resized = resized;
1530     new_block->used_length = size;
1531     new_block->max_length = max_size;
1532     assert(max_size >= size);
1533     new_block->fd = -1;
1534     new_block->host = host;
1535     if (host) {
1536         new_block->flags |= RAM_PREALLOC;
1537     }
1538     if (resizeable) {
1539         new_block->flags |= RAM_RESIZEABLE;
1540     }
1541     addr = ram_block_add(new_block, &local_err);
1542     if (local_err) {
1543         g_free(new_block);
1544         error_propagate(errp, local_err);
1545         return -1;
1546     }
1547     return addr;
1548 }
1549
/*
 * Create a fixed-size RAM block of @size bytes for @mr on top of the
 * caller-provided @host memory (used and maximum length are both @size,
 * no resize callback).  Returns the block's offset, or -1 with @errp set.
 */
ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
}
1555
/*
 * Create a fixed-size RAM block of @size bytes for @mr without
 * caller-provided host memory (used and maximum length are both @size).
 * Returns the block's offset, or -1 with @errp set.
 */
ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
}
1560
/*
 * Create a resizeable RAM block for @mr with an initial used length of
 * @size and a maximum length of @maxsz.  @resized is stored in the block
 * and invoked by qemu_ram_resize() when the used length changes.
 * Returns the block's offset, or -1 with @errp set.
 */
ram_addr_t qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
}
1569
1570 void qemu_ram_free_from_ptr(ram_addr_t addr)
1571 {
1572     RAMBlock *block;
1573
1574     qemu_mutex_lock_ramlist();
1575     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1576         if (addr == block->offset) {
1577             QLIST_REMOVE_RCU(block, next);
1578             ram_list.mru_block = NULL;
1579             /* Write list before version */
1580             smp_wmb();
1581             ram_list.version++;
1582             g_free_rcu(block, rcu);
1583             break;
1584         }
1585     }
1586     qemu_mutex_unlock_ramlist();
1587 }
1588
/*
 * Release the host memory of @block in the way that matches how it was
 * obtained, then free the block descriptor.  qemu_ram_free() schedules
 * this through call_rcu().
 */
static void reclaim_ramblock(RAMBlock *block)
{
    if (block->flags & RAM_PREALLOC) {
        /* Host memory was supplied by the caller; nothing to release. */
        ;
    } else if (xen_enabled()) {
        xen_invalidate_map_cache_entry(block->host);
#ifndef _WIN32
    } else if (block->fd >= 0) {
        /* File-backed block: drop the mapping and the descriptor. */
        munmap(block->host, block->max_length);
        close(block->fd);
#endif
    } else {
        qemu_anon_ram_free(block->host, block->max_length);
    }
    g_free(block);
}
1605
1606 void qemu_ram_free(ram_addr_t addr)
1607 {
1608     RAMBlock *block;
1609
1610     qemu_mutex_lock_ramlist();
1611     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1612         if (addr == block->offset) {
1613             QLIST_REMOVE_RCU(block, next);
1614             ram_list.mru_block = NULL;
1615             /* Write list before version */
1616             smp_wmb();
1617             ram_list.version++;
1618             call_rcu(block, reclaim_ramblock, rcu);
1619             break;
1620         }
1621     }
1622     qemu_mutex_unlock_ramlist();
1623 }
1624
1625 #ifndef _WIN32
1626 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
1627 {
1628     RAMBlock *block;
1629     ram_addr_t offset;
1630     int flags;
1631     void *area, *vaddr;
1632
1633     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1634         offset = addr - block->offset;
1635         if (offset < block->max_length) {
1636             vaddr = ramblock_ptr(block, offset);
1637             if (block->flags & RAM_PREALLOC) {
1638                 ;
1639             } else if (xen_enabled()) {
1640                 abort();
1641             } else {
1642                 flags = MAP_FIXED;
1643                 if (block->fd >= 0) {
1644                     flags |= (block->flags & RAM_SHARED ?
1645                               MAP_SHARED : MAP_PRIVATE);
1646                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1647                                 flags, block->fd, offset);
1648                 } else {
1649                     /*
1650                      * Remap needs to match alloc.  Accelerators that
1651                      * set phys_mem_alloc never remap.  If they did,
1652                      * we'd need a remap hook here.
1653                      */
1654                     assert(phys_mem_alloc == qemu_anon_ram_alloc);
1655
1656                     flags |= MAP_PRIVATE | MAP_ANONYMOUS;
1657                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1658                                 flags, -1, 0);
1659                 }
1660                 if (area != vaddr) {
1661                     fprintf(stderr, "Could not remap addr: "
1662                             RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
1663                             length, addr);
1664                     exit(1);
1665                 }
1666                 memory_try_enable_merging(vaddr, length);
1667                 qemu_ram_setup_dump(vaddr, length);
1668             }
1669         }
1670     }
1671 }
1672 #endif /* !_WIN32 */
1673
/*
 * Return the fd field of the RAM block that qemu_get_ram_block() finds
 * for @addr.  The block is looked up and read inside an RCU read-side
 * critical section.
 */
int qemu_get_ram_fd(ram_addr_t addr)
{
    RAMBlock *block;
    int fd;

    rcu_read_lock();
    block = qemu_get_ram_block(addr);
    fd = block->fd;
    rcu_read_unlock();
    return fd;
}
1685
/*
 * Return the host pointer for offset 0 of the RAM block that
 * qemu_get_ram_block() finds for @addr, i.e. the start of the block
 * rather than the location of @addr itself.  The lookup runs inside an
 * RCU read-side critical section, which has ended by the time the
 * pointer is returned.
 */
void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
{
    RAMBlock *block;
    void *ptr;

    rcu_read_lock();
    block = qemu_get_ram_block(addr);
    ptr = ramblock_ptr(block, 0);
    rcu_read_unlock();
    return ptr;
}
1697
1698 /* Return a host pointer to ram allocated with qemu_ram_alloc.
1699  * This should not be used for general purpose DMA.  Use address_space_map
1700  * or address_space_rw instead. For local memory (e.g. video ram) that the
1701  * device owns, use memory_region_get_ram_ptr.
1702  *
1703  * By the time this function returns, the returned pointer is not protected
1704  * by RCU anymore.  If the caller is not within an RCU critical section and
1705  * does not hold the iothread lock, it must have other means of protecting the
1706  * pointer, such as a reference to the region that includes the incoming
1707  * ram_addr_t.
1708  */
1709 void *qemu_get_ram_ptr(ram_addr_t addr)
1710 {
1711     RAMBlock *block;
1712     void *ptr;
1713
1714     rcu_read_lock();
1715     block = qemu_get_ram_block(addr);
1716
1717     if (xen_enabled() && block->host == NULL) {
1718         /* We need to check if the requested address is in the RAM
1719          * because we don't want to map the entire memory in QEMU.
1720          * In that case just map until the end of the page.
1721          */
1722         if (block->offset == 0) {
1723             ptr = xen_map_cache(addr, 0, 0);
1724             goto unlock;
1725         }
1726
1727         block->host = xen_map_cache(block->offset, block->max_length, 1);
1728     }
1729     ptr = ramblock_ptr(block, addr - block->offset);
1730
1731 unlock:
1732     rcu_read_unlock();
1733     return ptr;
1734 }
1735
1736 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
1737  * but takes a size argument.
1738  *
1739  * By the time this function returns, the returned pointer is not protected
1740  * by RCU anymore.  If the caller is not within an RCU critical section and
1741  * does not hold the iothread lock, it must have other means of protecting the
1742  * pointer, such as a reference to the region that includes the incoming
1743  * ram_addr_t.
1744  */
1745 static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
1746 {
1747     void *ptr;
1748     if (*size == 0) {
1749         return NULL;
1750     }
1751     if (xen_enabled()) {
1752         return xen_map_cache(addr, *size, 1);
1753     } else {
1754         RAMBlock *block;
1755         rcu_read_lock();
1756         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1757             if (addr - block->offset < block->max_length) {
1758                 if (addr - block->offset + *size > block->max_length)
1759                     *size = block->max_length - addr + block->offset;
1760                 ptr = ramblock_ptr(block, addr - block->offset);
1761                 rcu_read_unlock();
1762                 return ptr;
1763             }
1764         }
1765
1766         fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1767         abort();
1768     }
1769 }
1770
/* Some of the softmmu routines need to translate from a host pointer
 * (typically a TLB entry) back to a ram offset.
 *
 * On success the ram offset is stored in *ram_addr and the memory region
 * of the containing block is returned.  Outside Xen, NULL is returned and
 * *ram_addr is left untouched when no mapped block contains @ptr.
 *
 * By the time this function returns, the returned pointer is not protected
 * by RCU anymore.  If the caller is not within an RCU critical section and
 * does not hold the iothread lock, it must have other means of protecting the
 * pointer, such as a reference to the region that includes the incoming
 * ram_addr_t.
 */
MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
{
    RAMBlock *block;
    uint8_t *host = ptr;
    MemoryRegion *mr;

    if (xen_enabled()) {
        /* Under Xen the map cache does the reverse translation. */
        rcu_read_lock();
        *ram_addr = xen_ram_addr_from_mapcache(ptr);
        mr = qemu_get_ram_block(*ram_addr)->mr;
        rcu_read_unlock();
        return mr;
    }

    rcu_read_lock();
    /* Try the most recently used block before walking the whole list. */
    block = atomic_rcu_read(&ram_list.mru_block);
    if (block && block->host && host - block->host < block->max_length) {
        goto found;
    }

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        /* This case happens when the block is not mapped. */
        if (block->host == NULL) {
            continue;
        }
        if (host - block->host < block->max_length) {
            goto found;
        }
    }

    rcu_read_unlock();
    return NULL;

found:
    *ram_addr = block->offset + (host - block->host);
    mr = block->mr;
    rcu_read_unlock();
    return mr;
}
1819
/*
 * Write handler of notdirty_mem_ops.  Performs the store of @val (1, 2 or
 * 4 bytes; any other size aborts) to guest RAM at @ram_addr and updates
 * the dirty tracking for the written range.  @opaque is unused.
 */
static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
                               uint64_t val, unsigned size)
{
    /* Invalidate translated code for the written range while the page is
     * not yet flagged dirty for DIRTY_MEMORY_CODE.
     */
    if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
        tb_invalidate_phys_page_fast(ram_addr, size);
    }
    switch (size) {
    case 1:
        stb_p(qemu_get_ram_ptr(ram_addr), val);
        break;
    case 2:
        stw_p(qemu_get_ram_ptr(ram_addr), val);
        break;
    case 4:
        stl_p(qemu_get_ram_ptr(ram_addr), val);
        break;
    default:
        abort();
    }
    cpu_physical_memory_set_dirty_range_nocode(ram_addr, size);
    /* we remove the notdirty callback only if the code has been
       flushed */
    if (!cpu_physical_memory_is_clean(ram_addr)) {
        CPUArchState *env = current_cpu->env_ptr;
        tlb_set_dirty(env, current_cpu->mem_io_vaddr);
    }
}
1847
/*
 * Access filter of notdirty_mem_ops: only writes are accepted, whatever
 * the address and size.
 */
static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
                                 unsigned size, bool is_write)
{
    return is_write;
}
1853
/*
 * Region ops for the "not dirty" write path: there is no read handler and
 * notdirty_mem_accepts() rejects everything but writes.
 */
static const MemoryRegionOps notdirty_mem_ops = {
    .write = notdirty_mem_write,
    .valid.accepts = notdirty_mem_accepts,
    .endianness = DEVICE_NATIVE_ENDIAN,
};
1859
/* Generate a debug exception if a watchpoint has been hit.
 *
 * @offset: offset of the access within its page; it is added to the page
 *          of current_cpu->mem_io_vaddr to form the virtual address.
 * @len:    length of the access in bytes.
 * @flags:  BP_MEM_READ or BP_MEM_WRITE, matched against each watchpoint.
 */
static void check_watchpoint(int offset, int len, int flags)
{
    CPUState *cpu = current_cpu;
    CPUArchState *env = cpu->env_ptr;
    target_ulong pc, cs_base;
    target_ulong vaddr;
    CPUWatchpoint *wp;
    int cpu_flags;

    if (cpu->watchpoint_hit) {
        /* We re-entered the check after replacing the TB. Now raise
         * the debug interrupt so that it will trigger after the
         * current instruction. */
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
        return;
    }
    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (cpu_watchpoint_address_matches(wp, vaddr, len)
            && (wp->flags & flags)) {
            /* Record which kind of access hit and where. */
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = vaddr;
            /* Only the first matching watchpoint is stored in the CPU. */
            if (!cpu->watchpoint_hit) {
                cpu->watchpoint_hit = wp;
                tb_check_watchpoint(cpu);
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                    /* Raise the debug exception before the access. */
                    cpu->exception_index = EXCP_DEBUG;
                    cpu_loop_exit(cpu);
                } else {
                    /* Generate a new TB for the current CPU state and
                     * resume execution from it.
                     */
                    cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
                    tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
                    cpu_resume_from_signal(cpu, NULL);
                }
            }
        } else {
            /* No match: forget any hit recorded earlier. */
            wp->flags &= ~BP_WATCHPOINT_HIT;
        }
    }
}
1904
1905 /* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
1906    so these check for a hit then pass through to the normal out-of-line
1907    phys routines.  */
1908 static uint64_t watch_mem_read(void *opaque, hwaddr addr,
1909                                unsigned size)
1910 {
1911     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, BP_MEM_READ);
1912     switch (size) {
1913     case 1: return ldub_phys(&address_space_memory, addr);
1914     case 2: return lduw_phys(&address_space_memory, addr);
1915     case 4: return ldl_phys(&address_space_memory, addr);
1916     default: abort();
1917     }
1918 }
1919
1920 static void watch_mem_write(void *opaque, hwaddr addr,
1921                             uint64_t val, unsigned size)
1922 {
1923     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, BP_MEM_WRITE);
1924     switch (size) {
1925     case 1:
1926         stb_phys(&address_space_memory, addr, val);
1927         break;
1928     case 2:
1929         stw_phys(&address_space_memory, addr, val);
1930         break;
1931     case 4:
1932         stl_phys(&address_space_memory, addr, val);
1933         break;
1934     default: abort();
1935     }
1936 }
1937
1938 static const MemoryRegionOps watch_mem_ops = {
1939     .read = watch_mem_read,
1940     .write = watch_mem_write,
1941     .endianness = DEVICE_NATIVE_ENDIAN,
1942 };
1943
1944 static uint64_t subpage_read(void *opaque, hwaddr addr,
1945                              unsigned len)
1946 {
1947     subpage_t *subpage = opaque;
1948     uint8_t buf[8];
1949
1950 #if defined(DEBUG_SUBPAGE)
1951     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
1952            subpage, len, addr);
1953 #endif
1954     address_space_read(subpage->as, addr + subpage->base, buf, len);
1955     switch (len) {
1956     case 1:
1957         return ldub_p(buf);
1958     case 2:
1959         return lduw_p(buf);
1960     case 4:
1961         return ldl_p(buf);
1962     case 8:
1963         return ldq_p(buf);
1964     default:
1965         abort();
1966     }
1967 }
1968
1969 static void subpage_write(void *opaque, hwaddr addr,
1970                           uint64_t value, unsigned len)
1971 {
1972     subpage_t *subpage = opaque;
1973     uint8_t buf[8];
1974
1975 #if defined(DEBUG_SUBPAGE)
1976     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
1977            " value %"PRIx64"\n",
1978            __func__, subpage, len, addr, value);
1979 #endif
1980     switch (len) {
1981     case 1:
1982         stb_p(buf, value);
1983         break;
1984     case 2:
1985         stw_p(buf, value);
1986         break;
1987     case 4:
1988         stl_p(buf, value);
1989         break;
1990     case 8:
1991         stq_p(buf, value);
1992         break;
1993     default:
1994         abort();
1995     }
1996     address_space_write(subpage->as, addr + subpage->base, buf, len);
1997 }
1998
1999 static bool subpage_accepts(void *opaque, hwaddr addr,
2000                             unsigned len, bool is_write)
2001 {
2002     subpage_t *subpage = opaque;
2003 #if defined(DEBUG_SUBPAGE)
2004     printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2005            __func__, subpage, is_write ? 'w' : 'r', len, addr);
2006 #endif
2007
2008     return address_space_access_valid(subpage->as, addr + subpage->base,
2009                                       len, is_write);
2010 }
2011
2012 static const MemoryRegionOps subpage_ops = {
2013     .read = subpage_read,
2014     .write = subpage_write,
2015     .impl.min_access_size = 1,
2016     .impl.max_access_size = 8,
2017     .valid.min_access_size = 1,
2018     .valid.max_access_size = 8,
2019     .valid.accepts = subpage_accepts,
2020     .endianness = DEVICE_NATIVE_ENDIAN,
2021 };
2022
2023 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2024                              uint16_t section)
2025 {
2026     int idx, eidx;
2027
2028     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2029         return -1;
2030     idx = SUBPAGE_IDX(start);
2031     eidx = SUBPAGE_IDX(end);
2032 #if defined(DEBUG_SUBPAGE)
2033     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2034            __func__, mmio, start, end, idx, eidx, section);
2035 #endif
2036     for (; idx <= eidx; idx++) {
2037         mmio->sub_section[idx] = section;
2038     }
2039
2040     return 0;
2041 }
2042
2043 static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2044 {
2045     subpage_t *mmio;
2046
2047     mmio = g_malloc0(sizeof(subpage_t));
2048
2049     mmio->as = as;
2050     mmio->base = base;
2051     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2052                           NULL, TARGET_PAGE_SIZE);
2053     mmio->iomem.subpage = true;
2054 #if defined(DEBUG_SUBPAGE)
2055     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2056            mmio, base, TARGET_PAGE_SIZE);
2057 #endif
2058     subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2059
2060     return mmio;
2061 }
2062
2063 static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
2064                               MemoryRegion *mr)
2065 {
2066     assert(as);
2067     MemoryRegionSection section = {
2068         .address_space = as,
2069         .mr = mr,
2070         .offset_within_address_space = 0,
2071         .offset_within_region = 0,
2072         .size = int128_2_64(),
2073     };
2074
2075     return phys_section_add(map, &section);
2076 }
2077
2078 MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index)
2079 {
2080     AddressSpaceDispatch *d = atomic_rcu_read(&cpu->memory_dispatch);
2081     MemoryRegionSection *sections = d->map.sections;
2082
2083     return sections[index & ~TARGET_PAGE_MASK].mr;
2084 }
2085
2086 static void io_mem_init(void)
2087 {
2088     memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2089     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2090                           NULL, UINT64_MAX);
2091     memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2092                           NULL, UINT64_MAX);
2093     memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2094                           NULL, UINT64_MAX);
2095 }
2096
2097 static void mem_begin(MemoryListener *listener)
2098 {
2099     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2100     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2101     uint16_t n;
2102
2103     n = dummy_section(&d->map, as, &io_mem_unassigned);
2104     assert(n == PHYS_SECTION_UNASSIGNED);
2105     n = dummy_section(&d->map, as, &io_mem_notdirty);
2106     assert(n == PHYS_SECTION_NOTDIRTY);
2107     n = dummy_section(&d->map, as, &io_mem_rom);
2108     assert(n == PHYS_SECTION_ROM);
2109     n = dummy_section(&d->map, as, &io_mem_watch);
2110     assert(n == PHYS_SECTION_WATCH);
2111
2112     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2113     d->as = as;
2114     as->next_dispatch = d;
2115 }
2116
2117 static void address_space_dispatch_free(AddressSpaceDispatch *d)
2118 {
2119     phys_sections_free(&d->map);
2120     g_free(d);
2121 }
2122
2123 static void mem_commit(MemoryListener *listener)
2124 {
2125     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2126     AddressSpaceDispatch *cur = as->dispatch;
2127     AddressSpaceDispatch *next = as->next_dispatch;
2128
2129     phys_page_compact_all(next, next->map.nodes_nb);
2130
2131     atomic_rcu_set(&as->dispatch, next);
2132     if (cur) {
2133         call_rcu(cur, address_space_dispatch_free, rcu);
2134     }
2135 }
2136
2137 static void tcg_commit(MemoryListener *listener)
2138 {
2139     CPUState *cpu;
2140
2141     /* since each CPU stores ram addresses in its TLB cache, we must
2142        reset the modified entries */
2143     /* XXX: slow ! */
2144     CPU_FOREACH(cpu) {
2145         /* FIXME: Disentangle the cpu.h circular files deps so we can
2146            directly get the right CPU from listener.  */
2147         if (cpu->tcg_as_listener != listener) {
2148             continue;
2149         }
2150         cpu_reload_memory_map(cpu);
2151     }
2152 }
2153
2154 static void core_log_global_start(MemoryListener *listener)
2155 {
2156     cpu_physical_memory_set_dirty_tracking(true);
2157 }
2158
2159 static void core_log_global_stop(MemoryListener *listener)
2160 {
2161     cpu_physical_memory_set_dirty_tracking(false);
2162 }
2163
2164 static MemoryListener core_memory_listener = {
2165     .log_global_start = core_log_global_start,
2166     .log_global_stop = core_log_global_stop,
2167     .priority = 1,
2168 };
2169
2170 void address_space_init_dispatch(AddressSpace *as)
2171 {
2172     as->dispatch = NULL;
2173     as->dispatch_listener = (MemoryListener) {
2174         .begin = mem_begin,
2175         .commit = mem_commit,
2176         .region_add = mem_add,
2177         .region_nop = mem_add,
2178         .priority = 0,
2179     };
2180     memory_listener_register(&as->dispatch_listener, as);
2181 }
2182
2183 void address_space_unregister(AddressSpace *as)
2184 {
2185     memory_listener_unregister(&as->dispatch_listener);
2186 }
2187
2188 void address_space_destroy_dispatch(AddressSpace *as)
2189 {
2190     AddressSpaceDispatch *d = as->dispatch;
2191
2192     atomic_rcu_set(&as->dispatch, NULL);
2193     if (d) {
2194         call_rcu(d, address_space_dispatch_free, rcu);
2195     }
2196 }
2197
2198 static void memory_map_init(void)
2199 {
2200     system_memory = g_malloc(sizeof(*system_memory));
2201
2202     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2203     address_space_init(&address_space_memory, system_memory, "memory");
2204
2205     system_io = g_malloc(sizeof(*system_io));
2206     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2207                           65536);
2208     address_space_init(&address_space_io, system_io, "I/O");
2209
2210     memory_listener_register(&core_memory_listener, &address_space_memory);
2211 }
2212
2213 MemoryRegion *get_system_memory(void)
2214 {
2215     return system_memory;
2216 }
2217
2218 MemoryRegion *get_system_io(void)
2219 {
2220     return system_io;
2221 }
2222
2223 #endif /* !defined(CONFIG_USER_ONLY) */
2224
2225 /* physical memory access (slow version, mainly for debug) */
2226 #if defined(CONFIG_USER_ONLY)
2227 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2228                         uint8_t *buf, int len, int is_write)
2229 {
2230     int l, flags;
2231     target_ulong page;
2232     void * p;
2233
2234     while (len > 0) {
2235         page = addr & TARGET_PAGE_MASK;
2236         l = (page + TARGET_PAGE_SIZE) - addr;
2237         if (l > len)
2238             l = len;
2239         flags = page_get_flags(page);
2240         if (!(flags & PAGE_VALID))
2241             return -1;
2242         if (is_write) {
2243             if (!(flags & PAGE_WRITE))
2244                 return -1;
2245             /* XXX: this code should not depend on lock_user */
2246             if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2247                 return -1;
2248             memcpy(p, buf, l);
2249             unlock_user(p, addr, l);
2250         } else {
2251             if (!(flags & PAGE_READ))
2252                 return -1;
2253             /* XXX: this code should not depend on lock_user */
2254             if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2255                 return -1;
2256             memcpy(buf, p, l);
2257             unlock_user(p, addr, 0);
2258         }
2259         len -= l;
2260         buf += l;
2261         addr += l;
2262     }
2263     return 0;
2264 }
2265
2266 #else
2267
2268 static void invalidate_and_set_dirty(hwaddr addr,
2269                                      hwaddr length)
2270 {
2271     if (cpu_physical_memory_range_includes_clean(addr, length)) {
2272         tb_invalidate_phys_range(addr, addr + length, 0);
2273         cpu_physical_memory_set_dirty_range_nocode(addr, length);
2274     }
2275     xen_modified_memory(addr, length);
2276 }
2277
2278 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2279 {
2280     unsigned access_size_max = mr->ops->valid.max_access_size;
2281
2282     /* Regions are assumed to support 1-4 byte accesses unless
2283        otherwise specified.  */
2284     if (access_size_max == 0) {
2285         access_size_max = 4;
2286     }
2287
2288     /* Bound the maximum access by the alignment of the address.  */
2289     if (!mr->ops->impl.unaligned) {
2290         unsigned align_size_max = addr & -addr;
2291         if (align_size_max != 0 && align_size_max < access_size_max) {
2292             access_size_max = align_size_max;
2293         }
2294     }
2295
2296     /* Don't attempt accesses larger than the maximum.  */
2297     if (l > access_size_max) {
2298         l = access_size_max;
2299     }
2300     if (l & (l - 1)) {
2301         l = 1 << (qemu_fls(l) - 1);
2302     }
2303
2304     return l;
2305 }
2306
2307 bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
2308                       int len, bool is_write)
2309 {
2310     hwaddr l;
2311     uint8_t *ptr;
2312     uint64_t val;
2313     hwaddr addr1;
2314     MemoryRegion *mr;
2315     bool error = false;
2316
2317     while (len > 0) {
2318         l = len;
2319         mr = address_space_translate(as, addr, &addr1, &l, is_write);
2320
2321         if (is_write) {
2322             if (!memory_access_is_direct(mr, is_write)) {
2323                 l = memory_access_size(mr, l, addr1);
2324                 /* XXX: could force current_cpu to NULL to avoid
2325                    potential bugs */
2326                 switch (l) {
2327                 case 8:
2328                     /* 64 bit write access */
2329                     val = ldq_p(buf);
2330                     error |= io_mem_write(mr, addr1, val, 8);
2331                     break;
2332                 case 4:
2333                     /* 32 bit write access */
2334                     val = ldl_p(buf);
2335                     error |= io_mem_write(mr, addr1, val, 4);
2336                     break;
2337                 case 2:
2338                     /* 16 bit write access */
2339                     val = lduw_p(buf);
2340                     error |= io_mem_write(mr, addr1, val, 2);
2341                     break;
2342                 case 1:
2343                     /* 8 bit write access */
2344                     val = ldub_p(buf);
2345                     error |= io_mem_write(mr, addr1, val, 1);
2346                     break;
2347                 default:
2348                     abort();
2349                 }
2350             } else {
2351                 addr1 += memory_region_get_ram_addr(mr);
2352                 /* RAM case */
2353                 ptr = qemu_get_ram_ptr(addr1);
2354                 memcpy(ptr, buf, l);
2355                 invalidate_and_set_dirty(addr1, l);
2356             }
2357         } else {
2358             if (!memory_access_is_direct(mr, is_write)) {
2359                 /* I/O case */
2360                 l = memory_access_size(mr, l, addr1);
2361                 switch (l) {
2362                 case 8:
2363                     /* 64 bit read access */
2364                     error |= io_mem_read(mr, addr1, &val, 8);
2365                     stq_p(buf, val);
2366                     break;
2367                 case 4:
2368                     /* 32 bit read access */
2369                     error |= io_mem_read(mr, addr1, &val, 4);
2370                     stl_p(buf, val);
2371                     break;
2372                 case 2:
2373                     /* 16 bit read access */
2374                     error |= io_mem_read(mr, addr1, &val, 2);
2375                     stw_p(buf, val);
2376                     break;
2377                 case 1:
2378                     /* 8 bit read access */
2379                     error |= io_mem_read(mr, addr1, &val, 1);
2380                     stb_p(buf, val);
2381                     break;
2382                 default:
2383                     abort();
2384                 }
2385             } else {
2386                 /* RAM case */
2387                 ptr = qemu_get_ram_ptr(mr->ram_addr + addr1);
2388                 memcpy(buf, ptr, l);
2389             }
2390         }
2391         len -= l;
2392         buf += l;
2393         addr += l;
2394     }
2395
2396     return error;
2397 }
2398
2399 bool address_space_write(AddressSpace *as, hwaddr addr,
2400                          const uint8_t *buf, int len)
2401 {
2402     return address_space_rw(as, addr, (uint8_t *)buf, len, true);
2403 }
2404
2405 bool address_space_read(AddressSpace *as, hwaddr addr, uint8_t *buf, int len)
2406 {
2407     return address_space_rw(as, addr, buf, len, false);
2408 }
2409
2410
2411 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
2412                             int len, int is_write)
2413 {
2414     address_space_rw(&address_space_memory, addr, buf, len, is_write);
2415 }
2416
/* Operation selector for cpu_physical_memory_write_rom_internal(). */
enum write_rom_type {
    /* Copy the caller's buffer into the RAM/ROM backing store. */
    WRITE_DATA = 0,
    /* Flush the host instruction cache over the backing store. */
    FLUSH_CACHE = 1,
};
2421
2422 static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2423     hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
2424 {
2425     hwaddr l;
2426     uint8_t *ptr;
2427     hwaddr addr1;
2428     MemoryRegion *mr;
2429
2430     while (len > 0) {
2431         l = len;
2432         mr = address_space_translate(as, addr, &addr1, &l, true);
2433
2434         if (!(memory_region_is_ram(mr) ||
2435               memory_region_is_romd(mr))) {
2436             /* do nothing */
2437         } else {
2438             addr1 += memory_region_get_ram_addr(mr);
2439             /* ROM/RAM case */
2440             ptr = qemu_get_ram_ptr(addr1);
2441             switch (type) {
2442             case WRITE_DATA:
2443                 memcpy(ptr, buf, l);
2444                 invalidate_and_set_dirty(addr1, l);
2445                 break;
2446             case FLUSH_CACHE:
2447                 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
2448                 break;
2449             }
2450         }
2451         len -= l;
2452         buf += l;
2453         addr += l;
2454     }
2455 }
2456
2457 /* used for ROM loading : can write in RAM and ROM */
2458 void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2459                                    const uint8_t *buf, int len)
2460 {
2461     cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
2462 }
2463
2464 void cpu_flush_icache_range(hwaddr start, int len)
2465 {
2466     /*
2467      * This function should do the same thing as an icache flush that was
2468      * triggered from within the guest. For TCG we are always cache coherent,
2469      * so there is no need to flush anything. For KVM / Xen we need to flush
2470      * the host's instruction cache at least.
2471      */
2472     if (tcg_enabled()) {
2473         return;
2474     }
2475
2476     cpu_physical_memory_write_rom_internal(&address_space_memory,
2477                                            start, NULL, len, FLUSH_CACHE);
2478 }
2479
2480 typedef struct {
2481     MemoryRegion *mr;
2482     void *buffer;
2483     hwaddr addr;
2484     hwaddr len;
2485 } BounceBuffer;
2486
2487 static BounceBuffer bounce;
2488
2489 typedef struct MapClient {
2490     void *opaque;
2491     void (*callback)(void *opaque);
2492     QLIST_ENTRY(MapClient) link;
2493 } MapClient;
2494
2495 static QLIST_HEAD(map_client_list, MapClient) map_client_list
2496     = QLIST_HEAD_INITIALIZER(map_client_list);
2497
2498 void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque))
2499 {
2500     MapClient *client = g_malloc(sizeof(*client));
2501
2502     client->opaque = opaque;
2503     client->callback = callback;
2504     QLIST_INSERT_HEAD(&map_client_list, client, link);
2505     return client;
2506 }
2507
2508 static void cpu_unregister_map_client(void *_client)
2509 {
2510     MapClient *client = (MapClient *)_client;
2511
2512     QLIST_REMOVE(client, link);
2513     g_free(client);
2514 }
2515
2516 static void cpu_notify_map_clients(void)
2517 {
2518     MapClient *client;
2519
2520     while (!QLIST_EMPTY(&map_client_list)) {
2521         client = QLIST_FIRST(&map_client_list);
2522         client->callback(client->opaque);
2523         cpu_unregister_map_client(client);
2524     }
2525 }
2526
2527 bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
2528 {
2529     MemoryRegion *mr;
2530     hwaddr l, xlat;
2531
2532     while (len > 0) {
2533         l = len;
2534         mr = address_space_translate(as, addr, &xlat, &l, is_write);
2535         if (!memory_access_is_direct(mr, is_write)) {
2536             l = memory_access_size(mr, l, addr);
2537             if (!memory_region_access_valid(mr, xlat, l, is_write)) {
2538                 return false;
2539             }
2540         }
2541
2542         len -= l;
2543         addr += l;
2544     }
2545     return true;
2546 }
2547
2548 /* Map a physical memory region into a host virtual address.
2549  * May map a subset of the requested range, given by and returned in *plen.
2550  * May return NULL if resources needed to perform the mapping are exhausted.
2551  * Use only for reads OR writes - not for read-modify-write operations.
2552  * Use cpu_register_map_client() to know when retrying the map operation is
2553  * likely to succeed.
2554  */
2555 void *address_space_map(AddressSpace *as,
2556                         hwaddr addr,
2557                         hwaddr *plen,
2558                         bool is_write)
2559 {
2560     hwaddr len = *plen;
2561     hwaddr done = 0;
2562     hwaddr l, xlat, base;
2563     MemoryRegion *mr, *this_mr;
2564     ram_addr_t raddr;
2565
2566     if (len == 0) {
2567         return NULL;
2568     }
2569
2570     l = len;
2571     mr = address_space_translate(as, addr, &xlat, &l, is_write);
2572     if (!memory_access_is_direct(mr, is_write)) {
2573         if (bounce.buffer) {
2574             return NULL;
2575         }
2576         /* Avoid unbounded allocations */
2577         l = MIN(l, TARGET_PAGE_SIZE);
2578         bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
2579         bounce.addr = addr;
2580         bounce.len = l;
2581
2582         memory_region_ref(mr);
2583         bounce.mr = mr;
2584         if (!is_write) {
2585             address_space_read(as, addr, bounce.buffer, l);
2586         }
2587
2588         *plen = l;
2589         return bounce.buffer;
2590     }
2591
2592     base = xlat;
2593     raddr = memory_region_get_ram_addr(mr);
2594
2595     for (;;) {
2596         len -= l;
2597         addr += l;
2598         done += l;
2599         if (len == 0) {
2600             break;
2601         }
2602
2603         l = len;
2604         this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
2605         if (this_mr != mr || xlat != base + done) {
2606             break;
2607         }
2608     }
2609
2610     memory_region_ref(mr);
2611     *plen = done;
2612     return qemu_ram_ptr_length(raddr + base, plen);
2613 }
2614
2615 /* Unmaps a memory region previously mapped by address_space_map().
2616  * Will also mark the memory as dirty if is_write == 1.  access_len gives
2617  * the amount of memory that was actually read or written by the caller.
2618  */
2619 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
2620                          int is_write, hwaddr access_len)
2621 {
2622     if (buffer != bounce.buffer) {
2623         MemoryRegion *mr;
2624         ram_addr_t addr1;
2625
2626         mr = qemu_ram_addr_from_host(buffer, &addr1);
2627         assert(mr != NULL);
2628         if (is_write) {
2629             invalidate_and_set_dirty(addr1, access_len);
2630         }
2631         if (xen_enabled()) {
2632             xen_invalidate_map_cache_entry(buffer);
2633         }
2634         memory_region_unref(mr);
2635         return;
2636     }
2637     if (is_write) {
2638         address_space_write(as, bounce.addr, bounce.buffer, access_len);
2639     }
2640     qemu_vfree(bounce.buffer);
2641     bounce.buffer = NULL;
2642     memory_region_unref(bounce.mr);
2643     cpu_notify_map_clients();
2644 }
2645
2646 void *cpu_physical_memory_map(hwaddr addr,
2647                               hwaddr *plen,
2648                               int is_write)
2649 {
2650     return address_space_map(&address_space_memory, addr, plen, is_write);
2651 }
2652
2653 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
2654                                int is_write, hwaddr access_len)
2655 {
2656     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
2657 }
2658
2659 /* warning: addr must be aligned */
2660 static inline uint32_t ldl_phys_internal(AddressSpace *as, hwaddr addr,
2661                                          enum device_endian endian)
2662 {
2663     uint8_t *ptr;
2664     uint64_t val;
2665     MemoryRegion *mr;
2666     hwaddr l = 4;
2667     hwaddr addr1;
2668
2669     mr = address_space_translate(as, addr, &addr1, &l, false);
2670     if (l < 4 || !memory_access_is_direct(mr, false)) {
2671         /* I/O case */
2672         io_mem_read(mr, addr1, &val, 4);
2673 #if defined(TARGET_WORDS_BIGENDIAN)
2674         if (endian == DEVICE_LITTLE_ENDIAN) {
2675             val = bswap32(val);
2676         }
2677 #else
2678         if (endian == DEVICE_BIG_ENDIAN) {
2679             val = bswap32(val);
2680         }
2681 #endif
2682     } else {
2683         /* RAM case */
2684         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2685                                 & TARGET_PAGE_MASK)
2686                                + addr1);
2687         switch (endian) {
2688         case DEVICE_LITTLE_ENDIAN:
2689             val = ldl_le_p(ptr);
2690             break;
2691         case DEVICE_BIG_ENDIAN:
2692             val = ldl_be_p(ptr);
2693             break;
2694         default:
2695             val = ldl_p(ptr);
2696             break;
2697         }
2698     }
2699     return val;
2700 }
2701
2702 uint32_t ldl_phys(AddressSpace *as, hwaddr addr)
2703 {
2704     return ldl_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2705 }
2706
2707 uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr)
2708 {
2709     return ldl_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2710 }
2711
2712 uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr)
2713 {
2714     return ldl_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2715 }
2716
2717 /* warning: addr must be aligned */
2718 static inline uint64_t ldq_phys_internal(AddressSpace *as, hwaddr addr,
2719                                          enum device_endian endian)
2720 {
2721     uint8_t *ptr;
2722     uint64_t val;
2723     MemoryRegion *mr;
2724     hwaddr l = 8;
2725     hwaddr addr1;
2726
2727     mr = address_space_translate(as, addr, &addr1, &l,
2728                                  false);
2729     if (l < 8 || !memory_access_is_direct(mr, false)) {
2730         /* I/O case */
2731         io_mem_read(mr, addr1, &val, 8);
2732 #if defined(TARGET_WORDS_BIGENDIAN)
2733         if (endian == DEVICE_LITTLE_ENDIAN) {
2734             val = bswap64(val);
2735         }
2736 #else
2737         if (endian == DEVICE_BIG_ENDIAN) {
2738             val = bswap64(val);
2739         }
2740 #endif
2741     } else {
2742         /* RAM case */
2743         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2744                                 & TARGET_PAGE_MASK)
2745                                + addr1);
2746         switch (endian) {
2747         case DEVICE_LITTLE_ENDIAN:
2748             val = ldq_le_p(ptr);
2749             break;
2750         case DEVICE_BIG_ENDIAN:
2751             val = ldq_be_p(ptr);
2752             break;
2753         default:
2754             val = ldq_p(ptr);
2755             break;
2756         }
2757     }
2758     return val;
2759 }
2760
2761 uint64_t ldq_phys(AddressSpace *as, hwaddr addr)
2762 {
2763     return ldq_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2764 }
2765
2766 uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr)
2767 {
2768     return ldq_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2769 }
2770
2771 uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr)
2772 {
2773     return ldq_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2774 }
2775
2776 /* XXX: optimize */
2777 uint32_t ldub_phys(AddressSpace *as, hwaddr addr)
2778 {
2779     uint8_t val;
2780     address_space_rw(as, addr, &val, 1, 0);
2781     return val;
2782 }
2783
2784 /* warning: addr must be aligned */
2785 static inline uint32_t lduw_phys_internal(AddressSpace *as, hwaddr addr,
2786                                           enum device_endian endian)
2787 {
2788     uint8_t *ptr;
2789     uint64_t val;
2790     MemoryRegion *mr;
2791     hwaddr l = 2;
2792     hwaddr addr1;
2793
2794     mr = address_space_translate(as, addr, &addr1, &l,
2795                                  false);
2796     if (l < 2 || !memory_access_is_direct(mr, false)) {
2797         /* I/O case */
2798         io_mem_read(mr, addr1, &val, 2);
2799 #if defined(TARGET_WORDS_BIGENDIAN)
2800         if (endian == DEVICE_LITTLE_ENDIAN) {
2801             val = bswap16(val);
2802         }
2803 #else
2804         if (endian == DEVICE_BIG_ENDIAN) {
2805             val = bswap16(val);
2806         }
2807 #endif
2808     } else {
2809         /* RAM case */
2810         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2811                                 & TARGET_PAGE_MASK)
2812                                + addr1);
2813         switch (endian) {
2814         case DEVICE_LITTLE_ENDIAN:
2815             val = lduw_le_p(ptr);
2816             break;
2817         case DEVICE_BIG_ENDIAN:
2818             val = lduw_be_p(ptr);
2819             break;
2820         default:
2821             val = lduw_p(ptr);
2822             break;
2823         }
2824     }
2825     return val;
2826 }
2827
2828 uint32_t lduw_phys(AddressSpace *as, hwaddr addr)
2829 {
2830     return lduw_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2831 }
2832
2833 uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr)
2834 {
2835     return lduw_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2836 }
2837
2838 uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr)
2839 {
2840     return lduw_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2841 }
2842
2843 /* warning: addr must be aligned. The ram page is not masked as dirty
2844    and the code inside is not invalidated. It is useful if the dirty
2845    bits are used to track modified PTEs */
2846 void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val)
2847 {
2848     uint8_t *ptr;
2849     MemoryRegion *mr;
2850     hwaddr l = 4;
2851     hwaddr addr1;
2852
2853     mr = address_space_translate(as, addr, &addr1, &l,
2854                                  true);
2855     if (l < 4 || !memory_access_is_direct(mr, true)) {
2856         io_mem_write(mr, addr1, val, 4);
2857     } else {
2858         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2859         ptr = qemu_get_ram_ptr(addr1);
2860         stl_p(ptr, val);
2861
2862         if (unlikely(in_migration)) {
2863             if (cpu_physical_memory_is_clean(addr1)) {
2864                 /* invalidate code */
2865                 tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
2866                 /* set dirty bit */
2867                 cpu_physical_memory_set_dirty_range_nocode(addr1, 4);
2868             }
2869         }
2870     }
2871 }
2872
2873 /* warning: addr must be aligned */
2874 static inline void stl_phys_internal(AddressSpace *as,
2875                                      hwaddr addr, uint32_t val,
2876                                      enum device_endian endian)
2877 {
2878     uint8_t *ptr;
2879     MemoryRegion *mr;
2880     hwaddr l = 4;
2881     hwaddr addr1;
2882
2883     mr = address_space_translate(as, addr, &addr1, &l,
2884                                  true);
2885     if (l < 4 || !memory_access_is_direct(mr, true)) {
2886 #if defined(TARGET_WORDS_BIGENDIAN)
2887         if (endian == DEVICE_LITTLE_ENDIAN) {
2888             val = bswap32(val);
2889         }
2890 #else
2891         if (endian == DEVICE_BIG_ENDIAN) {
2892             val = bswap32(val);
2893         }
2894 #endif
2895         io_mem_write(mr, addr1, val, 4);
2896     } else {
2897         /* RAM case */
2898         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2899         ptr = qemu_get_ram_ptr(addr1);
2900         switch (endian) {
2901         case DEVICE_LITTLE_ENDIAN:
2902             stl_le_p(ptr, val);
2903             break;
2904         case DEVICE_BIG_ENDIAN:
2905             stl_be_p(ptr, val);
2906             break;
2907         default:
2908             stl_p(ptr, val);
2909             break;
2910         }
2911         invalidate_and_set_dirty(addr1, 4);
2912     }
2913 }
2914
2915 void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2916 {
2917     stl_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
2918 }
2919
2920 void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2921 {
2922     stl_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
2923 }
2924
2925 void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2926 {
2927     stl_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
2928 }
2929
2930 /* XXX: optimize */
2931 void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2932 {
2933     uint8_t v = val;
2934     address_space_rw(as, addr, &v, 1, 1);
2935 }
2936
2937 /* warning: addr must be aligned */
2938 static inline void stw_phys_internal(AddressSpace *as,
2939                                      hwaddr addr, uint32_t val,
2940                                      enum device_endian endian)
2941 {
2942     uint8_t *ptr;
2943     MemoryRegion *mr;
2944     hwaddr l = 2;
2945     hwaddr addr1;
2946
2947     mr = address_space_translate(as, addr, &addr1, &l, true);
2948     if (l < 2 || !memory_access_is_direct(mr, true)) {
2949 #if defined(TARGET_WORDS_BIGENDIAN)
2950         if (endian == DEVICE_LITTLE_ENDIAN) {
2951             val = bswap16(val);
2952         }
2953 #else
2954         if (endian == DEVICE_BIG_ENDIAN) {
2955             val = bswap16(val);
2956         }
2957 #endif
2958         io_mem_write(mr, addr1, val, 2);
2959     } else {
2960         /* RAM case */
2961         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2962         ptr = qemu_get_ram_ptr(addr1);
2963         switch (endian) {
2964         case DEVICE_LITTLE_ENDIAN:
2965             stw_le_p(ptr, val);
2966             break;
2967         case DEVICE_BIG_ENDIAN:
2968             stw_be_p(ptr, val);
2969             break;
2970         default:
2971             stw_p(ptr, val);
2972             break;
2973         }
2974         invalidate_and_set_dirty(addr1, 2);
2975     }
2976 }
2977
2978 void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2979 {
2980     stw_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
2981 }
2982
2983 void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2984 {
2985     stw_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
2986 }
2987
2988 void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2989 {
2990     stw_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
2991 }
2992
2993 /* XXX: optimize */
2994 void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val)
2995 {
2996     val = tswap64(val);
2997     address_space_rw(as, addr, (void *) &val, 8, 1);
2998 }
2999
3000 void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3001 {
3002     val = cpu_to_le64(val);
3003     address_space_rw(as, addr, (void *) &val, 8, 1);
3004 }
3005
3006 void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3007 {
3008     val = cpu_to_be64(val);
3009     address_space_rw(as, addr, (void *) &val, 8, 1);
3010 }
3011
3012 /* virtual memory access for debug (includes writing to ROM) */
3013 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3014                         uint8_t *buf, int len, int is_write)
3015 {
3016     int l;
3017     hwaddr phys_addr;
3018     target_ulong page;
3019
3020     while (len > 0) {
3021         page = addr & TARGET_PAGE_MASK;
3022         phys_addr = cpu_get_phys_page_debug(cpu, page);
3023         /* if no physical page mapped, return an error */
3024         if (phys_addr == -1)
3025             return -1;
3026         l = (page + TARGET_PAGE_SIZE) - addr;
3027         if (l > len)
3028             l = len;
3029         phys_addr += (addr & ~TARGET_PAGE_MASK);
3030         if (is_write) {
3031             cpu_physical_memory_write_rom(cpu->as, phys_addr, buf, l);
3032         } else {
3033             address_space_rw(cpu->as, phys_addr, buf, l, 0);
3034         }
3035         len -= l;
3036         buf += l;
3037         addr += l;
3038     }
3039     return 0;
3040 }
3041 #endif
3042
/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 *
 * Returns true when TARGET_WORDS_BIGENDIAN is defined at build time, false
 * otherwise.
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
    bool big_endian = false;

#if defined(TARGET_WORDS_BIGENDIAN)
    big_endian = true;
#endif
    return big_endian;
}
3056
3057 #ifndef CONFIG_USER_ONLY
3058 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3059 {
3060     MemoryRegion*mr;
3061     hwaddr l = 1;
3062
3063     mr = address_space_translate(&address_space_memory,
3064                                  phys_addr, &phys_addr, &l, false);
3065
3066     return !(memory_region_is_ram(mr) ||
3067              memory_region_is_romd(mr));
3068 }
3069
3070 void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3071 {
3072     RAMBlock *block;
3073
3074     rcu_read_lock();
3075     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3076         func(block->host, block->offset, block->used_length, opaque);
3077     }
3078     rcu_read_unlock();
3079 }
3080 #endif