exec.c

   1 /*
   2  *  Virtual page mapping
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19 #include "config.h"
  20 #ifndef _WIN32
  21 #include <sys/types.h>
  22 #include <sys/mman.h>
  23 #endif
  24
  25 #include "qemu-common.h"
  26 #include "cpu.h"
  27 #include "tcg.h"
  28 #include "hw/hw.h"
  29 #if !defined(CONFIG_USER_ONLY)
  30 #include "hw/boards.h"
  31 #endif
  32 #include "hw/qdev.h"
  33 #include "qemu/osdep.h"
  34 #include "sysemu/kvm.h"
  35 #include "sysemu/sysemu.h"
  36 #include "hw/xen/xen.h"
  37 #include "qemu/timer.h"
  38 #include "qemu/config-file.h"
  39 #include "qemu/error-report.h"
  40 #include "exec/memory.h"
  41 #include "sysemu/dma.h"
  42 #include "exec/address-spaces.h"
  43 #if defined(CONFIG_USER_ONLY)
  44 #include <qemu.h>
  45 #else /* !CONFIG_USER_ONLY */
  46 #include "sysemu/xen-mapcache.h"
  47 #include "trace.h"
  48 #endif
  49 #include "exec/cpu-all.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "exec/cputlb.h"
  52 #include "translate-all.h"
  53
  54 #include "exec/memory-internal.h"
  55 #include "exec/ram_addr.h"
  56
  57 #include "qemu/range.h"
  58
  59 //#define DEBUG_SUBPAGE
  60
  61 #if !defined(CONFIG_USER_ONLY)
/* Written by cpu_physical_memory_set_dirty_tracking(). */
static bool in_migration;

/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

/* NOTE(review): presumably the root regions of address_space_memory and
 * address_space_io; they are initialised outside this part of the file --
 * confirm.
 */
static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

/* Special regions; memory_region_is_unassigned() reports both as assigned. */
MemoryRegion io_mem_rom, io_mem_notdirty;
/* Region returned by address_space_translate() when an IOMMU entry does not
 * grant the requested access.
 */
static MemoryRegion io_mem_unassigned;

/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
#define RAM_PREALLOC   (1 << 0)

/* RAM is mmap-ed with MAP_SHARED */
#define RAM_SHARED     (1 << 1)

/* Only a portion of RAM (used_length) is actually used, and migrated.
 * This used_length size can change across reboots.
 */
#define RAM_RESIZEABLE (1 << 2)
  88
  89 #endif
  90
/* List of all CPUs; cpu_exec_init() appends each new CPU at the tail. */
struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
DEFINE_TLS(CPUState *, current_cpu);
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;
  99
 100 #if !defined(CONFIG_USER_ONLY)
 101
typedef struct PhysPageEntry PhysPageEntry;

/* One slot of the physical page map, packed into 32 bits. */
struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};

/* All-ones value of the 26-bit ptr field.  A non-leaf entry holding it has
 * no node allocated below it yet.
 */
#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
 112
/* Size of the L2 (and L3, etc) page tables.  */
#define ADDR_SPACE_BITS 64

/* Each table level resolves P_L2_BITS bits of the page index, so a table
 * has P_L2_SIZE entries.
 */
#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)

/* Number of levels needed to cover the ADDR_SPACE_BITS - TARGET_PAGE_BITS
 * page-index bits, rounding up.
 */
#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)

/* One table of the map: an array of P_L2_SIZE entries. */
typedef PhysPageEntry Node[P_L2_SIZE];
 122
/* Storage behind a physical page map: two growable arrays (table nodes and
 * memory region sections), each tracked by a used count and a capacity.
 */
typedef struct PhysPageMap {
    struct rcu_head rcu;

    unsigned sections_nb;        /* entries of sections[] in use */
    unsigned sections_nb_alloc;  /* allocated capacity of sections[] */
    unsigned nodes_nb;           /* entries of nodes[] in use */
    unsigned nodes_nb_alloc;     /* allocated capacity of nodes[] */
    Node *nodes;
    MemoryRegionSection *sections;
} PhysPageMap;
 133
/* Address lookup state for one AddressSpace: the root entry of the page
 * map, the storage it indexes into, and the owning address space.
 */
struct AddressSpaceDispatch {
    struct rcu_head rcu;

    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    PhysPageMap map;
    AddressSpace *as;
};
 144
/* Offset of addr within its target page. */
#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
/* State for a page whose bytes may belong to different sections. */
typedef struct subpage_t {
    MemoryRegion iomem;
    AddressSpace *as;
    hwaddr base;
    /* Section number for each byte of the page, indexed by SUBPAGE_IDX(). */
    uint16_t sub_section[TARGET_PAGE_SIZE];
} subpage_t;
 152
/* Fixed section numbers.  PHYS_SECTION_UNASSIGNED is what phys_page_find()
 * returns for addresses that are not mapped; NOTDIRTY, ROM and WATCH are
 * folded into iotlb values by memory_region_section_get_iotlb().
 */
#define PHYS_SECTION_UNASSIGNED 0
#define PHYS_SECTION_NOTDIRTY 1
#define PHYS_SECTION_ROM 2
#define PHYS_SECTION_WATCH 3

/* Forward declarations; the definitions are not in this part of the file. */
static void io_mem_init(void);
static void memory_map_init(void);
static void tcg_commit(MemoryListener *listener);

static MemoryRegion io_mem_watch;
 163 #endif
 164
 165 #if !defined(CONFIG_USER_ONLY)
 166
 167 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 168 {
 169     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 170         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc * 2, 16);
 171         map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
 172         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 173     }
 174 }
 175
 176 static uint32_t phys_map_node_alloc(PhysPageMap *map)
 177 {
 178     unsigned i;
 179     uint32_t ret;
 180
 181     ret = map->nodes_nb++;
 182     assert(ret != PHYS_MAP_NODE_NIL);
 183     assert(ret != map->nodes_nb_alloc);
 184     for (i = 0; i < P_L2_SIZE; ++i) {
 185         map->nodes[ret][i].skip = 1;
 186         map->nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
 187     }
 188     return ret;
 189 }
 190
/* Map pages to section number @leaf, working at table level @level below
 * the entry @lp.
 *
 * @index is the first page still to be mapped and @nb the number of pages
 * left.  Both are updated in place as pages are consumed, so the caller and
 * the recursive invocations share a single cursor.
 */
static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                int level)
{
    PhysPageEntry *p;
    int i;
    /* Number of pages covered by one entry of this level. */
    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);

    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        /* No table below this entry yet: allocate one. */
        lp->ptr = phys_map_node_alloc(map);
        p = map->nodes[lp->ptr];
        if (level == 0) {
            /* Bottom-level tables hold leaves; start them all unassigned. */
            for (i = 0; i < P_L2_SIZE; i++) {
                p[i].skip = 0;
                p[i].ptr = PHYS_SECTION_UNASSIGNED;
            }
        }
    } else {
        p = map->nodes[lp->ptr];
    }
    /* From here on lp walks the entries of the table, starting at the slot
     * selected by this level's bits of *index.
     */
    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];

    while (*nb && lp < &p[P_L2_SIZE]) {
        if ((*index & (step - 1)) == 0 && *nb >= step) {
            /* The remaining range is aligned to this entry and covers it
             * completely: make the entry itself a leaf.
             */
            lp->skip = 0;
            lp->ptr = leaf;
            *index += step;
            *nb -= step;
        } else {
            /* Partial coverage: descend one level. */
            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
        }
        ++lp;
    }
}
 225
 226 static void phys_page_set(AddressSpaceDispatch *d,
 227                           hwaddr index, hwaddr nb,
 228                           uint16_t leaf)
 229 {
 230     /* Wildly overreserve - it doesn't matter much. */
 231     phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 232
 233     phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 234 }
 235
 236 /* Compact a non leaf page entry. Simply detect that the entry has a single child,
 237  * and update our entry so we can skip it and go directly to the destination.
 238  */
 239 static void phys_page_compact(PhysPageEntry *lp, Node *nodes, unsigned long *compacted)
 240 {
 241     unsigned valid_ptr = P_L2_SIZE;
 242     int valid = 0;
 243     PhysPageEntry *p;
 244     int i;
 245
 246     if (lp->ptr == PHYS_MAP_NODE_NIL) {
 247         return;
 248     }
 249
 250     p = nodes[lp->ptr];
 251     for (i = 0; i < P_L2_SIZE; i++) {
 252         if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 253             continue;
 254         }
 255
 256         valid_ptr = i;
 257         valid++;
 258         if (p[i].skip) {
 259             phys_page_compact(&p[i], nodes, compacted);
 260         }
 261     }
 262
 263     /* We can only compress if there's only one child. */
 264     if (valid != 1) {
 265         return;
 266     }
 267
 268     assert(valid_ptr < P_L2_SIZE);
 269
 270     /* Don't compress if it won't fit in the # of bits we have. */
 271     if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
 272         return;
 273     }
 274
 275     lp->ptr = p[valid_ptr].ptr;
 276     if (!p[valid_ptr].skip) {
 277         /* If our only child is a leaf, make this a leaf. */
 278         /* By design, we should have made this node a leaf to begin with so we
 279          * should never reach here.
 280          * But since it's so simple to handle this, let's do it just in case we
 281          * change this rule.
 282          */
 283         lp->skip = 0;
 284     } else {
 285         lp->skip += p[valid_ptr].skip;
 286     }
 287 }
 288
 289 static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
 290 {
 291     DECLARE_BITMAP(compacted, nodes_nb);
 292
 293     if (d->phys_map.skip) {
 294         phys_page_compact(&d->phys_map, d->map.nodes, compacted);
 295     }
 296 }
 297
 298 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr addr,
 299                                            Node *nodes, MemoryRegionSection *sections)
 300 {
 301     PhysPageEntry *p;
 302     hwaddr index = addr >> TARGET_PAGE_BITS;
 303     int i;
 304
 305     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 306         if (lp.ptr == PHYS_MAP_NODE_NIL) {
 307             return &sections[PHYS_SECTION_UNASSIGNED];
 308         }
 309         p = nodes[lp.ptr];
 310         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 311     }
 312
 313     if (sections[lp.ptr].size.hi ||
 314         range_covers_byte(sections[lp.ptr].offset_within_address_space,
 315                           sections[lp.ptr].size.lo, addr)) {
 316         return &sections[lp.ptr];
 317     } else {
 318         return &sections[PHYS_SECTION_UNASSIGNED];
 319     }
 320 }
 321
 322 bool memory_region_is_unassigned(MemoryRegion *mr)
 323 {
 324     return mr != &io_mem_rom && mr != &io_mem_notdirty && !mr->rom_device
 325         && mr != &io_mem_watch;
 326 }
 327
 328 /* Called from RCU critical section */
 329 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 330                                                         hwaddr addr,
 331                                                         bool resolve_subpage)
 332 {
 333     MemoryRegionSection *section;
 334     subpage_t *subpage;
 335
 336     section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
 337     if (resolve_subpage && section->mr->subpage) {
 338         subpage = container_of(section->mr, subpage_t, iomem);
 339         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 340     }
 341     return section;
 342 }
 343
 344 /* Called from RCU critical section */
 345 static MemoryRegionSection *
 346 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 347                                  hwaddr *plen, bool resolve_subpage)
 348 {
 349     MemoryRegionSection *section;
 350     Int128 diff;
 351
 352     section = address_space_lookup_region(d, addr, resolve_subpage);
 353     /* Compute offset within MemoryRegionSection */
 354     addr -= section->offset_within_address_space;
 355
 356     /* Compute offset within MemoryRegion */
 357     *xlat = addr + section->offset_within_region;
 358
 359     diff = int128_sub(section->mr->size, int128_make64(addr));
 360     *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 361     return section;
 362 }
 363
 364 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 365 {
 366     if (memory_region_is_ram(mr)) {
 367         return !(is_write && mr->readonly);
 368     }
 369     if (memory_region_is_romd(mr)) {
 370         return !is_write;
 371     }
 372
 373     return false;
 374 }
 375
 376 MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
 377                                       hwaddr *xlat, hwaddr *plen,
 378                                       bool is_write)
 379 {
 380     IOMMUTLBEntry iotlb;
 381     MemoryRegionSection *section;
 382     MemoryRegion *mr;
 383     hwaddr len = *plen;
 384
 385     rcu_read_lock();
 386     for (;;) {
 387         AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
 388         section = address_space_translate_internal(d, addr, &addr, plen, true);
 389         mr = section->mr;
 390
 391         if (!mr->iommu_ops) {
 392             break;
 393         }
 394
 395         iotlb = mr->iommu_ops->translate(mr, addr, is_write);
 396         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 397                 | (addr & iotlb.addr_mask));
 398         len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
 399         if (!(iotlb.perm & (1 << is_write))) {
 400             mr = &io_mem_unassigned;
 401             break;
 402         }
 403
 404         as = iotlb.target_as;
 405     }
 406
 407     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 408         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 409         len = MIN(page, len);
 410     }
 411
 412     *plen = len;
 413     *xlat = addr;
 414     rcu_read_unlock();
 415     return mr;
 416 }
 417
 418 /* Called from RCU critical section */
 419 MemoryRegionSection *
 420 address_space_translate_for_iotlb(CPUState *cpu, hwaddr addr,
 421                                   hwaddr *xlat, hwaddr *plen)
 422 {
 423     MemoryRegionSection *section;
 424     section = address_space_translate_internal(cpu->memory_dispatch,
 425                                                addr, xlat, plen, false);
 426
 427     assert(!section->mr->iommu_ops);
 428     return section;
 429 }
 430 #endif
 431
/* One-time global initialisation.  In system-emulation builds this sets up
 * the RAM list mutex and then calls memory_map_init() and io_mem_init();
 * in user-only builds it does nothing.
 */
void cpu_exec_init_all(void)
{
#if !defined(CONFIG_USER_ONLY)
    qemu_mutex_init(&ram_list.mutex);
    memory_map_init();
    io_mem_init();
#endif
}
 440
 441 #if !defined(CONFIG_USER_ONLY)
 442
 443 static int cpu_common_post_load(void *opaque, int version_id)
 444 {
 445     CPUState *cpu = opaque;
 446
 447     /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 448        version_id is increased. */
 449     cpu->interrupt_request &= ~0x01;
 450     tlb_flush(cpu, 1);
 451
 452     return 0;
 453 }
 454
 455 static int cpu_common_pre_load(void *opaque)
 456 {
 457     CPUState *cpu = opaque;
 458
 459     cpu->exception_index = -1;
 460
 461     return 0;
 462 }
 463
 464 static bool cpu_common_exception_index_needed(void *opaque)
 465 {
 466     CPUState *cpu = opaque;
 467
 468     return tcg_enabled() && cpu->exception_index != -1;
 469 }
 470
/* Subsection of vmstate_cpu_common that carries CPUState.exception_index.
 * It is attached with cpu_common_exception_index_needed() as its predicate.
 */
static const VMStateDescription vmstate_cpu_common_exception_index = {
    .name = "cpu_common/exception_index",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT32(exception_index, CPUState),
        VMSTATE_END_OF_LIST()
    }
};
 480
/* Migration description of the target-independent CPU state: the halted
 * and interrupt_request fields, plus the optional exception_index
 * subsection.  cpu_exec_init() registers it for CPUs whose device has no
 * vmsd of its own.
 */
const VMStateDescription vmstate_cpu_common = {
    .name = "cpu_common",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = cpu_common_pre_load,
    .post_load = cpu_common_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(halted, CPUState),
        VMSTATE_UINT32(interrupt_request, CPUState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (VMStateSubsection[]) {
        {
            .vmsd = &vmstate_cpu_common_exception_index,
            .needed = cpu_common_exception_index_needed,
        } , {
            /* empty */
        }
    }
};
 501
 502 #endif
 503
 504 CPUState *qemu_get_cpu(int index)
 505 {
 506     CPUState *cpu;
 507
 508     CPU_FOREACH(cpu) {
 509         if (cpu->cpu_index == index) {
 510             return cpu;
 511         }
 512     }
 513
 514     return NULL;
 515 }
 516
 517 #if !defined(CONFIG_USER_ONLY)
 518 void tcg_cpu_address_space_init(CPUState *cpu, AddressSpace *as)
 519 {
 520     /* We only support one address space per cpu at the moment.  */
 521     assert(cpu->as == as);
 522
 523     if (cpu->tcg_as_listener) {
 524         memory_listener_unregister(cpu->tcg_as_listener);
 525     } else {
 526         cpu->tcg_as_listener = g_new0(MemoryListener, 1);
 527     }
 528     cpu->tcg_as_listener->commit = tcg_commit;
 529     memory_listener_register(cpu->tcg_as_listener, as);
 530 }
 531 #endif
 532
/* Common initialisation of a newly created CPU: assign it the next free
 * index, initialise its breakpoint and watchpoint lists, append it to the
 * global CPU list and register its migration state.
 */
void cpu_exec_init(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);
    CPUClass *cc = CPU_GET_CLASS(cpu);
    CPUState *some_cpu;
    int cpu_index;

#if defined(CONFIG_USER_ONLY)
    /* User-only builds take the CPU list lock around the list update. */
    cpu_list_lock();
#endif
    /* The new index is the number of CPUs already on the list. */
    cpu_index = 0;
    CPU_FOREACH(some_cpu) {
        cpu_index++;
    }
    cpu->cpu_index = cpu_index;
    cpu->numa_node = 0;
    QTAILQ_INIT(&cpu->breakpoints);
    QTAILQ_INIT(&cpu->watchpoints);
#ifndef CONFIG_USER_ONLY
    cpu->as = &address_space_memory;
    cpu->thread_id = qemu_get_thread_id();
    cpu_reload_memory_map(cpu);
#endif
    QTAILQ_INSERT_TAIL(&cpus, cpu, node);
#if defined(CONFIG_USER_ONLY)
    cpu_list_unlock();
#endif
    /* Fall back to the common CPU vmstate when the device has none. */
    if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
        vmstate_register(NULL, cpu_index, &vmstate_cpu_common, cpu);
    }
#if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY)
    /* Targets using the cpu_save/cpu_load interface must not also supply
     * a vmsd through the class or the device.
     */
    register_savevm(NULL, "cpu", cpu_index, CPU_SAVE_VERSION,
                    cpu_save, cpu_load, env);
    assert(cc->vmsd == NULL);
    assert(qdev_get_vmsd(DEVICE(cpu)) == NULL);
#endif
    if (cc->vmsd != NULL) {
        vmstate_register(NULL, cpu_index, cc->vmsd, cpu);
    }
}
 573
 574 #if defined(CONFIG_USER_ONLY)
/* User-only variant: invalidate translated code for the one-byte range
 * starting at @pc.  @cpu is not used.
 */
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    tb_invalidate_phys_page_range(pc, pc + 1, 0);
}
 579 #else
 580 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
 581 {
 582     hwaddr phys = cpu_get_phys_page_debug(cpu, pc);
 583     if (phys != -1) {
 584         tb_invalidate_phys_addr(cpu->as,
 585                                 phys | (pc & ~TARGET_PAGE_MASK));
 586     }
 587 }
 588 #endif
 589
 590 #if defined(CONFIG_USER_ONLY)
/* User-only stub: does nothing. */
void cpu_watchpoint_remove_all(CPUState *cpu, int mask)

{
}
 595
/* User-only stub: always fails with -ENOSYS. */
int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
                          int flags)
{
    return -ENOSYS;
}
 601
/* User-only stub: does nothing. */
void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
{
}
 605
/* User-only stub: always fails with -ENOSYS; *watchpoint is not written. */
int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                          int flags, CPUWatchpoint **watchpoint)
{
    return -ENOSYS;
}
 611 #else
 612 /* Add a watchpoint.  */
 613 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
 614                           int flags, CPUWatchpoint **watchpoint)
 615 {
 616     CPUWatchpoint *wp;
 617
 618     /* forbid ranges which are empty or run off the end of the address space */
 619     if (len == 0 || (addr + len - 1) < addr) {
 620         error_report("tried to set invalid watchpoint at %"
 621                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
 622         return -EINVAL;
 623     }
 624     wp = g_malloc(sizeof(*wp));
 625
 626     wp->vaddr = addr;
 627     wp->len = len;
 628     wp->flags = flags;
 629
 630     /* keep all GDB-injected watchpoints in front */
 631     if (flags & BP_GDB) {
 632         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
 633     } else {
 634         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
 635     }
 636
 637     tlb_flush_page(cpu, addr);
 638
 639     if (watchpoint)
 640         *watchpoint = wp;
 641     return 0;
 642 }
 643
 644 /* Remove a specific watchpoint.  */
 645 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
 646                           int flags)
 647 {
 648     CPUWatchpoint *wp;
 649
 650     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 651         if (addr == wp->vaddr && len == wp->len
 652                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
 653             cpu_watchpoint_remove_by_ref(cpu, wp);
 654             return 0;
 655         }
 656     }
 657     return -ENOENT;
 658 }
 659
 660 /* Remove a specific watchpoint by reference.  */
 661 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
 662 {
 663     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
 664
 665     tlb_flush_page(cpu, watchpoint->vaddr);
 666
 667     g_free(watchpoint);
 668 }
 669
 670 /* Remove all matching watchpoints.  */
 671 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
 672 {
 673     CPUWatchpoint *wp, *next;
 674
 675     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
 676         if (wp->flags & mask) {
 677             cpu_watchpoint_remove_by_ref(cpu, wp);
 678         }
 679     }
 680 }
 681
 682 /* Return true if this watchpoint address matches the specified
 683  * access (ie the address range covered by the watchpoint overlaps
 684  * partially or completely with the address range covered by the
 685  * access).
 686  */
 687 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
 688                                                   vaddr addr,
 689                                                   vaddr len)
 690 {
 691     /* We know the lengths are non-zero, but a little caution is
 692      * required to avoid errors in the case where the range ends
 693      * exactly at the top of the address space and so addr + len
 694      * wraps round to zero.
 695      */
 696     vaddr wpend = wp->vaddr + wp->len - 1;
 697     vaddr addrend = addr + len - 1;
 698
 699     return !(addr > wpend || wp->vaddr > addrend);
 700 }
 701
 702 #endif
 703
 704 /* Add a breakpoint.  */
 705 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
 706                           CPUBreakpoint **breakpoint)
 707 {
 708     CPUBreakpoint *bp;
 709
 710     bp = g_malloc(sizeof(*bp));
 711
 712     bp->pc = pc;
 713     bp->flags = flags;
 714
 715     /* keep all GDB-injected breakpoints in front */
 716     if (flags & BP_GDB) {
 717         QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
 718     } else {
 719         QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
 720     }
 721
 722     breakpoint_invalidate(cpu, pc);
 723
 724     if (breakpoint) {
 725         *breakpoint = bp;
 726     }
 727     return 0;
 728 }
 729
 730 /* Remove a specific breakpoint.  */
 731 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
 732 {
 733     CPUBreakpoint *bp;
 734
 735     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
 736         if (bp->pc == pc && bp->flags == flags) {
 737             cpu_breakpoint_remove_by_ref(cpu, bp);
 738             return 0;
 739         }
 740     }
 741     return -ENOENT;
 742 }
 743
 744 /* Remove a specific breakpoint by reference.  */
 745 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
 746 {
 747     QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
 748
 749     breakpoint_invalidate(cpu, breakpoint->pc);
 750
 751     g_free(breakpoint);
 752 }
 753
 754 /* Remove all matching breakpoints. */
 755 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
 756 {
 757     CPUBreakpoint *bp, *next;
 758
 759     QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
 760         if (bp->flags & mask) {
 761             cpu_breakpoint_remove_by_ref(cpu, bp);
 762         }
 763     }
 764 }
 765
 766 /* enable or disable single step mode. EXCP_DEBUG is returned by the
 767    CPU loop after each instruction */
 768 void cpu_single_step(CPUState *cpu, int enabled)
 769 {
 770     if (cpu->singlestep_enabled != enabled) {
 771         cpu->singlestep_enabled = enabled;
 772         if (kvm_enabled()) {
 773             kvm_update_guest_debug(cpu, 0);
 774         } else {
 775             /* must flush all the translated code to avoid inconsistencies */
 776             /* XXX: only flush what is necessary */
 777             CPUArchState *env = cpu->env_ptr;
 778             tb_flush(env);
 779         }
 780     }
 781 }
 782
 783 void cpu_abort(CPUState *cpu, const char *fmt, ...)
 784 {
 785     va_list ap;
 786     va_list ap2;
 787
 788     va_start(ap, fmt);
 789     va_copy(ap2, ap);
 790     fprintf(stderr, "qemu: fatal: ");
 791     vfprintf(stderr, fmt, ap);
 792     fprintf(stderr, "\n");
 793     cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 794     if (qemu_log_enabled()) {
 795         qemu_log("qemu: fatal: ");
 796         qemu_log_vprintf(fmt, ap2);
 797         qemu_log("\n");
 798         log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
 799         qemu_log_flush();
 800         qemu_log_close();
 801     }
 802     va_end(ap2);
 803     va_end(ap);
 804 #if defined(CONFIG_USER_ONLY)
 805     {
 806         struct sigaction act;
 807         sigfillset(&act.sa_mask);
 808         act.sa_handler = SIG_DFL;
 809         sigaction(SIGABRT, &act, NULL);
 810     }
 811 #endif
 812     abort();
 813 }
 814
 815 #if !defined(CONFIG_USER_ONLY)
 816 /* Called from RCU critical section */
 817 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
 818 {
 819     RAMBlock *block;
 820
 821     block = atomic_rcu_read(&ram_list.mru_block);
 822     if (block && addr - block->offset < block->max_length) {
 823         goto found;
 824     }
 825     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 826         if (addr - block->offset < block->max_length) {
 827             goto found;
 828         }
 829     }
 830
 831     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
 832     abort();
 833
 834 found:
 835     /* It is safe to write mru_block outside the iothread lock.  This
 836      * is what happens:
 837      *
 838      *     mru_block = xxx
 839      *     rcu_read_unlock()
 840      *                                        xxx removed from list
 841      *                  rcu_read_lock()
 842      *                  read mru_block
 843      *                                        mru_block = NULL;
 844      *                                        call_rcu(reclaim_ramblock, xxx);
 845      *                  rcu_read_unlock()
 846      *
 847      * atomic_rcu_set is not needed here.  The block was already published
 848      * when it was placed into the list.  Here we're just making an extra
 849      * copy of the pointer.
 850      */
 851     ram_list.mru_block = block;
 852     return block;
 853 }
 854
 855 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
 856 {
 857     ram_addr_t start1;
 858     RAMBlock *block;
 859     ram_addr_t end;
 860
 861     end = TARGET_PAGE_ALIGN(start + length);
 862     start &= TARGET_PAGE_MASK;
 863
 864     rcu_read_lock();
 865     block = qemu_get_ram_block(start);
 866     assert(block == qemu_get_ram_block(end - 1));
 867     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
 868     cpu_tlb_reset_dirty_all(start1, length);
 869     rcu_read_unlock();
 870 }
 871
 872 /* Note: start and end must be within the same ram block.  */
 873 void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t length,
 874                                      unsigned client)
 875 {
 876     if (length == 0)
 877         return;
 878     cpu_physical_memory_clear_dirty_range_type(start, length, client);
 879
 880     if (tcg_enabled()) {
 881         tlb_reset_dirty_range_all(start, length);
 882     }
 883 }
 884
/* Record in in_migration whether dirty tracking is enabled. */
static void cpu_physical_memory_set_dirty_tracking(bool enable)
{
    in_migration = enable;
}
 889
 890 /* Called from RCU critical section */
 891 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
 892                                        MemoryRegionSection *section,
 893                                        target_ulong vaddr,
 894                                        hwaddr paddr, hwaddr xlat,
 895                                        int prot,
 896                                        target_ulong *address)
 897 {
 898     hwaddr iotlb;
 899     CPUWatchpoint *wp;
 900
 901     if (memory_region_is_ram(section->mr)) {
 902         /* Normal RAM.  */
 903         iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
 904             + xlat;
 905         if (!section->readonly) {
 906             iotlb |= PHYS_SECTION_NOTDIRTY;
 907         } else {
 908             iotlb |= PHYS_SECTION_ROM;
 909         }
 910     } else {
 911         iotlb = section - section->address_space->dispatch->map.sections;
 912         iotlb += xlat;
 913     }
 914
 915     /* Make accesses to pages with watchpoints go via the
 916        watchpoint trap routines.  */
 917     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
 918         if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
 919             /* Avoid trapping reads of pages with a write breakpoint. */
 920             if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
 921                 iotlb = PHYS_SECTION_WATCH + paddr;
 922                 *address |= TLB_MMIO;
 923                 break;
 924             }
 925         }
 926     }
 927
 928     return iotlb;
 929 }
 930 #endif /* defined(CONFIG_USER_ONLY) */
 931
 932 #if !defined(CONFIG_USER_ONLY)
 933
/* Subpage helpers used by register_subpage(); their definitions are not in
 * this part of the file.
 */
static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                             uint16_t section);
static subpage_t *subpage_init(AddressSpace *as, hwaddr base);

/* Allocator hook; defaults to qemu_anon_ram_alloc and can be replaced with
 * phys_mem_set_alloc().
 */
static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
                               qemu_anon_ram_alloc;
 940
/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 *
 * @alloc replaces the function stored in phys_mem_alloc.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
{
    phys_mem_alloc = alloc;
}
 950
 951 static uint16_t phys_section_add(PhysPageMap *map,
 952                                  MemoryRegionSection *section)
 953 {
 954     /* The physical section number is ORed with a page-aligned
 955      * pointer to produce the iotlb entries.  Thus it should
 956      * never overflow into the page-aligned value.
 957      */
 958     assert(map->sections_nb < TARGET_PAGE_SIZE);
 959
 960     if (map->sections_nb == map->sections_nb_alloc) {
 961         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
 962         map->sections = g_renew(MemoryRegionSection, map->sections,
 963                                 map->sections_nb_alloc);
 964     }
 965     map->sections[map->sections_nb] = *section;
 966     memory_region_ref(section->mr);
 967     return map->sections_nb++;
 968 }
 969
 970 static void phys_section_destroy(MemoryRegion *mr)
 971 {
 972     memory_region_unref(mr);
 973
 974     if (mr->subpage) {
 975         subpage_t *subpage = container_of(mr, subpage_t, iomem);
 976         object_unref(OBJECT(&subpage->iomem));
 977         g_free(subpage);
 978     }
 979 }
 980
 981 static void phys_sections_free(PhysPageMap *map)
 982 {
 983     while (map->sections_nb > 0) {
 984         MemoryRegionSection *section = &map->sections[--map->sections_nb];
 985         phys_section_destroy(section->mr);
 986     }
 987     g_free(map->sections);
 988     g_free(map->nodes);
 989 }
 990
 991 static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
 992 {
 993     subpage_t *subpage;
 994     hwaddr base = section->offset_within_address_space
 995         & TARGET_PAGE_MASK;
 996     MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
 997                                                    d->map.nodes, d->map.sections);
 998     MemoryRegionSection subsection = {
 999         .offset_within_address_space = base,
1000         .size = int128_make64(TARGET_PAGE_SIZE),
1001     };
1002     hwaddr start, end;
1003
1004     assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1005
1006     if (!(existing->mr->subpage)) {
1007         subpage = subpage_init(d->as, base);
1008         subsection.address_space = d->as;
1009         subsection.mr = &subpage->iomem;
1010         phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1011                       phys_section_add(&d->map, &subsection));
1012     } else {
1013         subpage = container_of(existing->mr, subpage_t, iomem);
1014     }
1015     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1016     end = start + int128_get64(section->size) - 1;
1017     subpage_register(subpage, start, end,
1018                      phys_section_add(&d->map, section));
1019 }
1020
1021
1022 static void register_multipage(AddressSpaceDispatch *d,
1023                                MemoryRegionSection *section)
1024 {
1025     hwaddr start_addr = section->offset_within_address_space;
1026     uint16_t section_index = phys_section_add(&d->map, section);
1027     uint64_t num_pages = int128_get64(int128_rshift(section->size,
1028                                                     TARGET_PAGE_BITS));
1029
1030     assert(num_pages);
1031     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1032 }
1033
/* MemoryListener callback: enter @section into the dispatch structure
 * found in the next_dispatch field of the listener's AddressSpace.
 *
 * The section is consumed piece by piece.  Pieces that do not start on a
 * page boundary, or that are shorter than a page, are handed to
 * register_subpage(); page-aligned runs of whole pages are handed to
 * register_multipage().
 */
static void mem_add(MemoryListener *listener, MemoryRegionSection *section)
{
    AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
    AddressSpaceDispatch *d = as->next_dispatch;
    /* "now" is the piece being registered, "remain" is what is left of the
     * section including "now".
     */
    MemoryRegionSection now = *section, remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    if (now.offset_within_address_space & ~TARGET_PAGE_MASK) {
        /* Unaligned start: the first piece runs up to the next page
         * boundary, or to the end of the section if that comes first.
         */
        uint64_t left = TARGET_PAGE_ALIGN(now.offset_within_address_space)
                       - now.offset_within_address_space;

        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(d, &now);
    } else {
        /* Aligned start: nothing has been consumed yet. */
        now.size = int128_zero();
    }
    /* The loop ends once the piece just handled is all that remained. */
    while (int128_ne(remain.size, now.size)) {
        /* Advance "remain" past the piece that was just registered. */
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
        now = remain;
        if (int128_lt(remain.size, page_size)) {
            /* Less than one page is left: register all of it as a subpage. */
            register_subpage(d, &now);
        } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
            /* Start is not page aligned: register page_size bytes as a
             * subpage piece.
             */
            now.size = page_size;
            register_subpage(d, &now);
        } else {
            /* Aligned start and at least one page left: round the size
             * down to a multiple of the page size and register it whole.
             */
            now.size = int128_and(now.size, int128_neg(page_size));
            register_multipage(d, &now);
        }
    }
}
1066
/* Flush KVM's coalesced MMIO buffer; does nothing when KVM is not enabled. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }

    kvm_flush_coalesced_mmio_buffer();
}
1072
/* Acquire ram_list.mutex.  Pair with qemu_mutex_unlock_ramlist(). */
void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}
1077
/* Release ram_list.mutex, as taken by qemu_mutex_lock_ramlist(). */
void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}
1082
1083 #ifdef __linux__
1084
1085 #include <sys/vfs.h>
1086
1087 #define HUGETLBFS_MAGIC       0x958458f6
1088
1089 static long gethugepagesize(const char *path, Error **errp)
1090 {
1091     struct statfs fs;
1092     int ret;
1093
1094     do {
1095         ret = statfs(path, &fs);
1096     } while (ret != 0 && errno == EINTR);
1097
1098     if (ret != 0) {
1099         error_setg_errno(errp, errno, "failed to get page size of file %s",
1100                          path);
1101         return 0;
1102     }
1103
1104     if (fs.f_type != HUGETLBFS_MAGIC)
1105         fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
1106
1107     return fs.f_bsize;
1108 }
1109
1110 static void *file_ram_alloc(RAMBlock *block,
1111                             ram_addr_t memory,
1112                             const char *path,
1113                             Error **errp)
1114 {
1115     char *filename;
1116     char *sanitized_name;
1117     char *c;
1118     void *area = NULL;
1119     int fd;
1120     uint64_t hpagesize;
1121     Error *local_err = NULL;
1122
1123     hpagesize = gethugepagesize(path, &local_err);
1124     if (local_err) {
1125         error_propagate(errp, local_err);
1126         goto error;
1127     }
1128     block->mr->align = hpagesize;
1129
1130     if (memory < hpagesize) {
1131         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1132                    "or larger than huge page size 0x%" PRIx64,
1133                    memory, hpagesize);
1134         goto error;
1135     }
1136
1137     if (kvm_enabled() && !kvm_has_sync_mmu()) {
1138         error_setg(errp,
1139                    "host lacks kvm mmu notifiers, -mem-path unsupported");
1140         goto error;
1141     }
1142
1143     /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1144     sanitized_name = g_strdup(memory_region_name(block->mr));
1145     for (c = sanitized_name; *c != '\0'; c++) {
1146         if (*c == '/')
1147             *c = '_';
1148     }
1149
1150     filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1151                                sanitized_name);
1152     g_free(sanitized_name);
1153
1154     fd = mkstemp(filename);
1155     if (fd < 0) {
1156         error_setg_errno(errp, errno,
1157                          "unable to create backing store for hugepages");
1158         g_free(filename);
1159         goto error;
1160     }
1161     unlink(filename);
1162     g_free(filename);
1163
1164     memory = (memory+hpagesize-1) & ~(hpagesize-1);
1165
1166     /*
1167      * ftruncate is not supported by hugetlbfs in older
1168      * hosts, so don't bother bailing out on errors.
1169      * If anything goes wrong with it under other filesystems,
1170      * mmap will fail.
1171      */
1172     if (ftruncate(fd, memory)) {
1173         perror("ftruncate");
1174     }
1175
1176     area = mmap(0, memory, PROT_READ | PROT_WRITE,
1177                 (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE),
1178                 fd, 0);
1179     if (area == MAP_FAILED) {
1180         error_setg_errno(errp, errno,
1181                          "unable to map backing store for hugepages");
1182         close(fd);
1183         goto error;
1184     }
1185
1186     if (mem_prealloc) {
1187         os_mem_prealloc(fd, area, memory);
1188     }
1189
1190     block->fd = fd;
1191     return area;
1192
1193 error:
1194     if (mem_prealloc) {
1195         error_report("%s", error_get_pretty(*errp));
1196         exit(1);
1197     }
1198     return NULL;
1199 }
1200 #endif
1201
1202 /* Called with the ramlist lock held.  */
1203 static ram_addr_t find_ram_offset(ram_addr_t size)
1204 {
1205     RAMBlock *block, *next_block;
1206     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1207
1208     assert(size != 0); /* it would hand out same offset multiple times */
1209
1210     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1211         return 0;
1212     }
1213
1214     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1215         ram_addr_t end, next = RAM_ADDR_MAX;
1216
1217         end = block->offset + block->max_length;
1218
1219         QLIST_FOREACH_RCU(next_block, &ram_list.blocks, next) {
1220             if (next_block->offset >= end) {
1221                 next = MIN(next, next_block->offset);
1222             }
1223         }
1224         if (next - end >= size && next - end < mingap) {
1225             offset = end;
1226             mingap = next - end;
1227         }
1228     }
1229
1230     if (offset == RAM_ADDR_MAX) {
1231         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1232                 (uint64_t)size);
1233         abort();
1234     }
1235
1236     return offset;
1237 }
1238
1239 ram_addr_t last_ram_offset(void)
1240 {
1241     RAMBlock *block;
1242     ram_addr_t last = 0;
1243
1244     rcu_read_lock();
1245     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1246         last = MAX(last, block->offset + block->max_length);
1247     }
1248     rcu_read_unlock();
1249     return last;
1250 }
1251
1252 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1253 {
1254     int ret;
1255
1256     /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1257     if (!machine_dump_guest_core(current_machine)) {
1258         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1259         if (ret) {
1260             perror("qemu_madvise");
1261             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1262                             "but dump_guest_core=off specified\n");
1263         }
1264     }
1265 }
1266
1267 /* Called within an RCU critical section, or while the ramlist lock
1268  * is held.
1269  */
1270 static RAMBlock *find_ram_block(ram_addr_t addr)
1271 {
1272     RAMBlock *block;
1273
1274     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1275         if (block->offset == addr) {
1276             return block;
1277         }
1278     }
1279
1280     return NULL;
1281 }
1282
1283 /* Called with iothread lock held.  */
1284 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
1285 {
1286     RAMBlock *new_block, *block;
1287
1288     rcu_read_lock();
1289     new_block = find_ram_block(addr);
1290     assert(new_block);
1291     assert(!new_block->idstr[0]);
1292
1293     if (dev) {
1294         char *id = qdev_get_dev_path(dev);
1295         if (id) {
1296             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1297             g_free(id);
1298         }
1299     }
1300     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1301
1302     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1303         if (block != new_block && !strcmp(block->idstr, new_block->idstr)) {
1304             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1305                     new_block->idstr);
1306             abort();
1307         }
1308     }
1309     rcu_read_unlock();
1310 }
1311
/* Called with iothread lock held.
 *
 * Clear the id string of the block whose offset is @addr.  Does nothing
 * when no such block exists.
 */
void qemu_ram_unset_idstr(ram_addr_t addr)
{
    RAMBlock *block;

    /* FIXME: arch_init.c assumes that this is not called throughout
     * migration.  Ignore the problem since hot-unplug during migration
     * does not work anyway.
     */

    rcu_read_lock();
    block = find_ram_block(addr);
    if (block) {
        /* Zero the whole buffer, not just the first byte. */
        memset(block->idstr, 0, sizeof(block->idstr));
    }
    rcu_read_unlock();
}
1329
1330 static int memory_try_enable_merging(void *addr, size_t len)
1331 {
1332     if (!machine_mem_merge(current_machine)) {
1333         /* disabled by the user */
1334         return 0;
1335     }
1336
1337     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1338 }
1339
1340 /* Only legal before guest might have detected the memory size: e.g. on
1341  * incoming migration, or right after reset.
1342  *
1343  * As memory core doesn't know how is memory accessed, it is up to
1344  * resize callback to update device state and/or add assertions to detect
1345  * misuse, if necessary.
1346  */
1347 int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
1348 {
1349     RAMBlock *block = find_ram_block(base);
1350
1351     assert(block);
1352
1353     newsize = TARGET_PAGE_ALIGN(newsize);
1354
1355     if (block->used_length == newsize) {
1356         return 0;
1357     }
1358
1359     if (!(block->flags & RAM_RESIZEABLE)) {
1360         error_setg_errno(errp, EINVAL,
1361                          "Length mismatch: %s: 0x" RAM_ADDR_FMT
1362                          " in != 0x" RAM_ADDR_FMT, block->idstr,
1363                          newsize, block->used_length);
1364         return -EINVAL;
1365     }
1366
1367     if (block->max_length < newsize) {
1368         error_setg_errno(errp, EINVAL,
1369                          "Length too large: %s: 0x" RAM_ADDR_FMT
1370                          " > 0x" RAM_ADDR_FMT, block->idstr,
1371                          newsize, block->max_length);
1372         return -EINVAL;
1373     }
1374
1375     cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1376     block->used_length = newsize;
1377     cpu_physical_memory_set_dirty_range(block->offset, block->used_length);
1378     memory_region_set_size(block->mr, newsize);
1379     if (block->resized) {
1380         block->resized(block->idstr, newsize, block->host);
1381     }
1382     return 0;
1383 }
1384
/* Insert @new_block into ram_list and return the offset assigned to it.
 *
 * The offset is chosen with find_ram_offset().  If the block has no host
 * pointer yet, memory is obtained from Xen or from phys_mem_alloc; if
 * phys_mem_alloc fails, an error is stored in @errp and -1 is returned
 * without inserting the block.  After insertion the dirty bitmaps are
 * extended if the RAM address space grew, the block's used range is marked
 * dirty, and madvise hints are applied to host memory.
 */
static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp)
{
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    ram_addr_t old_ram_size, new_ram_size;

    /* RAM size in pages before the insertion, for the bitmap resize below. */
    old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr);
        } else {
            new_block->host = phys_mem_alloc(new_block->max_length,
                                             &new_block->mr->align);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return -1;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
        }
    }

    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    /* block is non-NULL only if the loop stopped at a smaller block. */
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    new_ram_size = last_ram_offset() >> TARGET_PAGE_BITS;

    if (new_ram_size > old_ram_size) {
        int i;

        /* ram_list.dirty_memory[] is protected by the iothread lock.  */
        for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
            ram_list.dirty_memory[i] =
                bitmap_zero_extend(ram_list.dirty_memory[i],
                                   old_ram_size, new_ram_size);
       }
    }
    cpu_physical_memory_set_dirty_range(new_block->offset,
                                        new_block->used_length);

    /* host can still be NULL here, e.g. after the xen_ram_alloc() path. */
    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
        if (kvm_enabled()) {
            kvm_setup_guest_memory(new_block->host, new_block->max_length);
        }
    }

    return new_block->offset;
}
1464
1465 #ifdef __linux__
1466 ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
1467                                     bool share, const char *mem_path,
1468                                     Error **errp)
1469 {
1470     RAMBlock *new_block;
1471     ram_addr_t addr;
1472     Error *local_err = NULL;
1473
1474     if (xen_enabled()) {
1475         error_setg(errp, "-mem-path not supported with Xen");
1476         return -1;
1477     }
1478
1479     if (phys_mem_alloc != qemu_anon_ram_alloc) {
1480         /*
1481          * file_ram_alloc() needs to allocate just like
1482          * phys_mem_alloc, but we haven't bothered to provide
1483          * a hook there.
1484          */
1485         error_setg(errp,
1486                    "-mem-path not supported with this accelerator");
1487         return -1;
1488     }
1489
1490     size = TARGET_PAGE_ALIGN(size);
1491     new_block = g_malloc0(sizeof(*new_block));
1492     new_block->mr = mr;
1493     new_block->used_length = size;
1494     new_block->max_length = size;
1495     new_block->flags = share ? RAM_SHARED : 0;
1496     new_block->host = file_ram_alloc(new_block, size,
1497                                      mem_path, errp);
1498     if (!new_block->host) {
1499         g_free(new_block);
1500         return -1;
1501     }
1502
1503     addr = ram_block_add(new_block, &local_err);
1504     if (local_err) {
1505         g_free(new_block);
1506         error_propagate(errp, local_err);
1507         return -1;
1508     }
1509     return addr;
1510 }
1511 #endif
1512
1513 static
1514 ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
1515                                    void (*resized)(const char*,
1516                                                    uint64_t length,
1517                                                    void *host),
1518                                    void *host, bool resizeable,
1519                                    MemoryRegion *mr, Error **errp)
1520 {
1521     RAMBlock *new_block;
1522     ram_addr_t addr;
1523     Error *local_err = NULL;
1524
1525     size = TARGET_PAGE_ALIGN(size);
1526     max_size = TARGET_PAGE_ALIGN(max_size);
1527     new_block = g_malloc0(sizeof(*new_block));
1528     new_block->mr = mr;
1529     new_block->resized = resized;
1530     new_block->used_length = size;
1531     new_block->max_length = max_size;
1532     assert(max_size >= size);
1533     new_block->fd = -1;
1534     new_block->host = host;
1535     if (host) {
1536         new_block->flags |= RAM_PREALLOC;
1537     }
1538     if (resizeable) {
1539         new_block->flags |= RAM_RESIZEABLE;
1540     }
1541     addr = ram_block_add(new_block, &local_err);
1542     if (local_err) {
1543         g_free(new_block);
1544         error_propagate(errp, local_err);
1545         return -1;
1546     }
1547     return addr;
1548 }
1549
/* Register a fixed-size RAM block of @size bytes for @mr on top of the
 * caller-supplied buffer @host.  Forwards to qemu_ram_alloc_internal() with
 * max_size equal to @size, no resize callback and resizeable set to false.
 * Returns what qemu_ram_alloc_internal() returns.
 */
ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                   MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
}
1555
/* Create a fixed-size RAM block of @size bytes for @mr with no
 * caller-supplied host buffer.  Forwards to qemu_ram_alloc_internal() with
 * max_size equal to @size, no resize callback and resizeable set to false.
 * Returns what qemu_ram_alloc_internal() returns.
 */
ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
}
1560
/* Create a resizeable RAM block for @mr with initial used length @size and
 * maximum length @maxsz, and no caller-supplied host buffer.  @resized is
 * stored as the block's resize callback.  Forwards to
 * qemu_ram_alloc_internal() with resizeable set to true and returns its
 * result.
 */
ram_addr_t qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                     void (*resized)(const char*,
                                                     uint64_t length,
                                                     void *host),
                                     MemoryRegion *mr, Error **errp)
{
    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
}
1569
1570 void qemu_ram_free_from_ptr(ram_addr_t addr)
1571 {
1572     RAMBlock *block;
1573
1574     qemu_mutex_lock_ramlist();
1575     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1576         if (addr == block->offset) {
1577             QLIST_REMOVE_RCU(block, next);
1578             ram_list.mru_block = NULL;
1579             /* Write list before version */
1580             smp_wmb();
1581             ram_list.version++;
1582             g_free_rcu(block, rcu);
1583             break;
1584         }
1585     }
1586     qemu_mutex_unlock_ramlist();
1587 }
1588
1589 static void reclaim_ramblock(RAMBlock *block)
1590 {
1591     if (block->flags & RAM_PREALLOC) {
1592         ;
1593     } else if (xen_enabled()) {
1594         xen_invalidate_map_cache_entry(block->host);
1595 #ifndef _WIN32
1596     } else if (block->fd >= 0) {
1597         munmap(block->host, block->max_length);
1598         close(block->fd);
1599 #endif
1600     } else {
1601         qemu_anon_ram_free(block->host, block->max_length);
1602     }
1603     g_free(block);
1604 }
1605
1606 void qemu_ram_free(ram_addr_t addr)
1607 {
1608     RAMBlock *block;
1609
1610     qemu_mutex_lock_ramlist();
1611     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1612         if (addr == block->offset) {
1613             QLIST_REMOVE_RCU(block, next);
1614             ram_list.mru_block = NULL;
1615             /* Write list before version */
1616             smp_wmb();
1617             ram_list.version++;
1618             call_rcu(block, reclaim_ramblock, rcu);
1619             break;
1620         }
1621     }
1622     qemu_mutex_unlock_ramlist();
1623 }
1624
1625 #ifndef _WIN32
1626 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
1627 {
1628     RAMBlock *block;
1629     ram_addr_t offset;
1630     int flags;
1631     void *area, *vaddr;
1632
1633     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1634         offset = addr - block->offset;
1635         if (offset < block->max_length) {
1636             vaddr = ramblock_ptr(block, offset);
1637             if (block->flags & RAM_PREALLOC) {
1638                 ;
1639             } else if (xen_enabled()) {
1640                 abort();
1641             } else {
1642                 flags = MAP_FIXED;
1643                 munmap(vaddr, length);
1644                 if (block->fd >= 0) {
1645                     flags |= (block->flags & RAM_SHARED ?
1646                               MAP_SHARED : MAP_PRIVATE);
1647                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1648                                 flags, block->fd, offset);
1649                 } else {
1650                     /*
1651                      * Remap needs to match alloc.  Accelerators that
1652                      * set phys_mem_alloc never remap.  If they did,
1653                      * we'd need a remap hook here.
1654                      */
1655                     assert(phys_mem_alloc == qemu_anon_ram_alloc);
1656
1657                     flags |= MAP_PRIVATE | MAP_ANONYMOUS;
1658                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
1659                                 flags, -1, 0);
1660                 }
1661                 if (area != vaddr) {
1662                     fprintf(stderr, "Could not remap addr: "
1663                             RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
1664                             length, addr);
1665                     exit(1);
1666                 }
1667                 memory_try_enable_merging(vaddr, length);
1668                 qemu_ram_setup_dump(vaddr, length);
1669             }
1670         }
1671     }
1672 }
1673 #endif /* !_WIN32 */
1674
/* Return the fd field of the RAM block that qemu_get_ram_block() finds
 * for @addr.  Blocks created through qemu_ram_alloc_internal() have fd set
 * to -1.  The lookup and the read happen inside an RCU read-side section.
 */
int qemu_get_ram_fd(ram_addr_t addr)
{
    RAMBlock *block;
    int fd;

    rcu_read_lock();
    /* NOTE(review): block is used without a NULL check; presumably
     * qemu_get_ram_block() never returns NULL -- confirm.
     */
    block = qemu_get_ram_block(addr);
    fd = block->fd;
    rcu_read_unlock();
    return fd;
}
1686
/* Return the host pointer for offset 0 of the RAM block that
 * qemu_get_ram_block() finds for @addr, i.e. the start of the block rather
 * than the location of @addr itself.  The lookup happens inside an RCU
 * read-side section; the returned pointer is used outside of it.
 */
void *qemu_get_ram_block_host_ptr(ram_addr_t addr)
{
    RAMBlock *block;
    void *ptr;

    rcu_read_lock();
    block = qemu_get_ram_block(addr);
    ptr = ramblock_ptr(block, 0);
    rcu_read_unlock();
    return ptr;
}
1698
1699 /* Return a host pointer to ram allocated with qemu_ram_alloc.
1700  * This should not be used for general purpose DMA.  Use address_space_map
1701  * or address_space_rw instead. For local memory (e.g. video ram) that the
1702  * device owns, use memory_region_get_ram_ptr.
1703  *
1704  * By the time this function returns, the returned pointer is not protected
1705  * by RCU anymore.  If the caller is not within an RCU critical section and
1706  * does not hold the iothread lock, it must have other means of protecting the
1707  * pointer, such as a reference to the region that includes the incoming
1708  * ram_addr_t.
1709  */
1710 void *qemu_get_ram_ptr(ram_addr_t addr)
1711 {
1712     RAMBlock *block;
1713     void *ptr;
1714
1715     rcu_read_lock();
1716     block = qemu_get_ram_block(addr);
1717
1718     if (xen_enabled() && block->host == NULL) {
1719         /* We need to check if the requested address is in the RAM
1720          * because we don't want to map the entire memory in QEMU.
1721          * In that case just map until the end of the page.
1722          */
1723         if (block->offset == 0) {
1724             ptr = xen_map_cache(addr, 0, 0);
1725             goto unlock;
1726         }
1727
1728         block->host = xen_map_cache(block->offset, block->max_length, 1);
1729     }
1730     ptr = ramblock_ptr(block, addr - block->offset);
1731
1732 unlock:
1733     rcu_read_unlock();
1734     return ptr;
1735 }
1736
1737 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
1738  * but takes a size argument.
1739  *
1740  * By the time this function returns, the returned pointer is not protected
1741  * by RCU anymore.  If the caller is not within an RCU critical section and
1742  * does not hold the iothread lock, it must have other means of protecting the
1743  * pointer, such as a reference to the region that includes the incoming
1744  * ram_addr_t.
1745  */
1746 static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
1747 {
1748     void *ptr;
1749     if (*size == 0) {
1750         return NULL;
1751     }
1752     if (xen_enabled()) {
1753         return xen_map_cache(addr, *size, 1);
1754     } else {
1755         RAMBlock *block;
1756         rcu_read_lock();
1757         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1758             if (addr - block->offset < block->max_length) {
1759                 if (addr - block->offset + *size > block->max_length)
1760                     *size = block->max_length - addr + block->offset;
1761                 ptr = ramblock_ptr(block, addr - block->offset);
1762                 rcu_read_unlock();
1763                 return ptr;
1764             }
1765         }
1766
1767         fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1768         abort();
1769     }
1770 }
1771
1772 /* Some of the softmmu routines need to translate from a host pointer
1773  * (typically a TLB entry) back to a ram offset.
1774  *
1775  * By the time this function returns, the returned pointer is not protected
1776  * by RCU anymore.  If the caller is not within an RCU critical section and
1777  * does not hold the iothread lock, it must have other means of protecting the
1778  * pointer, such as a reference to the region that includes the incoming
1779  * ram_addr_t.
1780  */
1781 MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
1782 {
1783     RAMBlock *block;
1784     uint8_t *host = ptr;
1785     MemoryRegion *mr;
1786
1787     if (xen_enabled()) {
1788         rcu_read_lock();
1789         *ram_addr = xen_ram_addr_from_mapcache(ptr);
1790         mr = qemu_get_ram_block(*ram_addr)->mr;
1791         rcu_read_unlock();
1792         return mr;
1793     }
1794
1795     rcu_read_lock();
1796     block = atomic_rcu_read(&ram_list.mru_block);
1797     if (block && block->host && host - block->host < block->max_length) {
1798         goto found;
1799     }
1800
1801     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1802         /* This case append when the block is not mapped. */
1803         if (block->host == NULL) {
1804             continue;
1805         }
1806         if (host - block->host < block->max_length) {
1807             goto found;
1808         }
1809     }
1810
1811     rcu_read_unlock();
1812     return NULL;
1813
1814 found:
1815     *ram_addr = block->offset + (host - block->host);
1816     mr = block->mr;
1817     rcu_read_unlock();
1818     return mr;
1819 }
1820
1821 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
1822                                uint64_t val, unsigned size)
1823 {
1824     if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
1825         tb_invalidate_phys_page_fast(ram_addr, size);
1826     }
1827     switch (size) {
1828     case 1:
1829         stb_p(qemu_get_ram_ptr(ram_addr), val);
1830         break;
1831     case 2:
1832         stw_p(qemu_get_ram_ptr(ram_addr), val);
1833         break;
1834     case 4:
1835         stl_p(qemu_get_ram_ptr(ram_addr), val);
1836         break;
1837     default:
1838         abort();
1839     }
1840     cpu_physical_memory_set_dirty_range_nocode(ram_addr, size);
1841     /* we remove the notdirty callback only if the code has been
1842        flushed */
1843     if (!cpu_physical_memory_is_clean(ram_addr)) {
1844         CPUArchState *env = current_cpu->env_ptr;
1845         tlb_set_dirty(env, current_cpu->mem_io_vaddr);
1846     }
1847 }
1848
/* valid.accepts hook of notdirty_mem_ops: accept writes and reject reads,
 * whatever the address and size.
 */
static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
                                 unsigned size, bool is_write)
{
    return is_write;
}
1854
1855 static const MemoryRegionOps notdirty_mem_ops = {
1856     .write = notdirty_mem_write,
1857     .valid.accepts = notdirty_mem_accepts,
1858     .endianness = DEVICE_NATIVE_ENDIAN,
1859 };
1860
/* Generate a debug exception if a watchpoint has been hit.
 *
 * @offset: offset of the access within the page of the current CPU's
 *          mem_io_vaddr.
 * @len:    length of the access in bytes.
 * @flags:  access type, compared against BP_MEM_READ and matched against
 *          each watchpoint's flags.
 */
static void check_watchpoint(int offset, int len, int flags)
{
    CPUState *cpu = current_cpu;
    CPUArchState *env = cpu->env_ptr;
    target_ulong pc, cs_base;
    target_ulong vaddr;
    CPUWatchpoint *wp;
    int cpu_flags;

    if (cpu->watchpoint_hit) {
        /* We re-entered the check after replacing the TB. Now raise
         * the debug interrupt so that is will trigger after the
         * current instruction. */
        cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
        return;
    }
    /* Rebuild the full virtual address of the access. */
    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
        if (cpu_watchpoint_address_matches(wp, vaddr, len)
            && (wp->flags & flags)) {
            /* Record which kind of access hit, and where. */
            if (flags == BP_MEM_READ) {
                wp->flags |= BP_WATCHPOINT_HIT_READ;
            } else {
                wp->flags |= BP_WATCHPOINT_HIT_WRITE;
            }
            wp->hitaddr = vaddr;
            /* Only the first matching watchpoint triggers the stop. */
            if (!cpu->watchpoint_hit) {
                cpu->watchpoint_hit = wp;
                tb_check_watchpoint(cpu);
                if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                    cpu->exception_index = EXCP_DEBUG;
                    cpu_loop_exit(cpu);
                } else {
                    /* NOTE(review): the final argument 1 to tb_gen_code()
                     * presumably limits the new TB to one instruction --
                     * confirm against tb_gen_code().
                     */
                    cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
                    tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
                    cpu_resume_from_signal(cpu, NULL);
                }
            }
        } else {
            /* No match: clear any stale hit marker on this watchpoint. */
            wp->flags &= ~BP_WATCHPOINT_HIT;
        }
    }
}
1905
1906 /* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
1907    so these check for a hit then pass through to the normal out-of-line
1908    phys routines.  */
1909 static uint64_t watch_mem_read(void *opaque, hwaddr addr,
1910                                unsigned size)
1911 {
1912     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, BP_MEM_READ);
1913     switch (size) {
1914     case 1: return ldub_phys(&address_space_memory, addr);
1915     case 2: return lduw_phys(&address_space_memory, addr);
1916     case 4: return ldl_phys(&address_space_memory, addr);
1917     default: abort();
1918     }
1919 }
1920
1921 static void watch_mem_write(void *opaque, hwaddr addr,
1922                             uint64_t val, unsigned size)
1923 {
1924     check_watchpoint(addr & ~TARGET_PAGE_MASK, size, BP_MEM_WRITE);
1925     switch (size) {
1926     case 1:
1927         stb_phys(&address_space_memory, addr, val);
1928         break;
1929     case 2:
1930         stw_phys(&address_space_memory, addr, val);
1931         break;
1932     case 4:
1933         stl_phys(&address_space_memory, addr, val);
1934         break;
1935     default: abort();
1936     }
1937 }
1938
1939 static const MemoryRegionOps watch_mem_ops = {
1940     .read = watch_mem_read,
1941     .write = watch_mem_write,
1942     .endianness = DEVICE_NATIVE_ENDIAN,
1943 };
1944
1945 static uint64_t subpage_read(void *opaque, hwaddr addr,
1946                              unsigned len)
1947 {
1948     subpage_t *subpage = opaque;
1949     uint8_t buf[8];
1950
1951 #if defined(DEBUG_SUBPAGE)
1952     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
1953            subpage, len, addr);
1954 #endif
1955     address_space_read(subpage->as, addr + subpage->base, buf, len);
1956     switch (len) {
1957     case 1:
1958         return ldub_p(buf);
1959     case 2:
1960         return lduw_p(buf);
1961     case 4:
1962         return ldl_p(buf);
1963     case 8:
1964         return ldq_p(buf);
1965     default:
1966         abort();
1967     }
1968 }
1969
1970 static void subpage_write(void *opaque, hwaddr addr,
1971                           uint64_t value, unsigned len)
1972 {
1973     subpage_t *subpage = opaque;
1974     uint8_t buf[8];
1975
1976 #if defined(DEBUG_SUBPAGE)
1977     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
1978            " value %"PRIx64"\n",
1979            __func__, subpage, len, addr, value);
1980 #endif
1981     switch (len) {
1982     case 1:
1983         stb_p(buf, value);
1984         break;
1985     case 2:
1986         stw_p(buf, value);
1987         break;
1988     case 4:
1989         stl_p(buf, value);
1990         break;
1991     case 8:
1992         stq_p(buf, value);
1993         break;
1994     default:
1995         abort();
1996     }
1997     address_space_write(subpage->as, addr + subpage->base, buf, len);
1998 }
1999
2000 static bool subpage_accepts(void *opaque, hwaddr addr,
2001                             unsigned len, bool is_write)
2002 {
2003     subpage_t *subpage = opaque;
2004 #if defined(DEBUG_SUBPAGE)
2005     printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2006            __func__, subpage, is_write ? 'w' : 'r', len, addr);
2007 #endif
2008
2009     return address_space_access_valid(subpage->as, addr + subpage->base,
2010                                       len, is_write);
2011 }
2012
2013 static const MemoryRegionOps subpage_ops = {
2014     .read = subpage_read,
2015     .write = subpage_write,
2016     .impl.min_access_size = 1,
2017     .impl.max_access_size = 8,
2018     .valid.min_access_size = 1,
2019     .valid.max_access_size = 8,
2020     .valid.accepts = subpage_accepts,
2021     .endianness = DEVICE_NATIVE_ENDIAN,
2022 };
2023
2024 static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
2025                              uint16_t section)
2026 {
2027     int idx, eidx;
2028
2029     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2030         return -1;
2031     idx = SUBPAGE_IDX(start);
2032     eidx = SUBPAGE_IDX(end);
2033 #if defined(DEBUG_SUBPAGE)
2034     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2035            __func__, mmio, start, end, idx, eidx, section);
2036 #endif
2037     for (; idx <= eidx; idx++) {
2038         mmio->sub_section[idx] = section;
2039     }
2040
2041     return 0;
2042 }
2043
2044 static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
2045 {
2046     subpage_t *mmio;
2047
2048     mmio = g_malloc0(sizeof(subpage_t));
2049
2050     mmio->as = as;
2051     mmio->base = base;
2052     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2053                           NULL, TARGET_PAGE_SIZE);
2054     mmio->iomem.subpage = true;
2055 #if defined(DEBUG_SUBPAGE)
2056     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2057            mmio, base, TARGET_PAGE_SIZE);
2058 #endif
2059     subpage_register(mmio, 0, TARGET_PAGE_SIZE-1, PHYS_SECTION_UNASSIGNED);
2060
2061     return mmio;
2062 }
2063
2064 static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
2065                               MemoryRegion *mr)
2066 {
2067     assert(as);
2068     MemoryRegionSection section = {
2069         .address_space = as,
2070         .mr = mr,
2071         .offset_within_address_space = 0,
2072         .offset_within_region = 0,
2073         .size = int128_2_64(),
2074     };
2075
2076     return phys_section_add(map, &section);
2077 }
2078
2079 MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index)
2080 {
2081     AddressSpaceDispatch *d = atomic_rcu_read(&cpu->memory_dispatch);
2082     MemoryRegionSection *sections = d->map.sections;
2083
2084     return sections[index & ~TARGET_PAGE_MASK].mr;
2085 }
2086
2087 static void io_mem_init(void)
2088 {
2089     memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
2090     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2091                           NULL, UINT64_MAX);
2092     memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
2093                           NULL, UINT64_MAX);
2094     memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
2095                           NULL, UINT64_MAX);
2096 }
2097
2098 static void mem_begin(MemoryListener *listener)
2099 {
2100     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2101     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2102     uint16_t n;
2103
2104     n = dummy_section(&d->map, as, &io_mem_unassigned);
2105     assert(n == PHYS_SECTION_UNASSIGNED);
2106     n = dummy_section(&d->map, as, &io_mem_notdirty);
2107     assert(n == PHYS_SECTION_NOTDIRTY);
2108     n = dummy_section(&d->map, as, &io_mem_rom);
2109     assert(n == PHYS_SECTION_ROM);
2110     n = dummy_section(&d->map, as, &io_mem_watch);
2111     assert(n == PHYS_SECTION_WATCH);
2112
2113     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2114     d->as = as;
2115     as->next_dispatch = d;
2116 }
2117
2118 static void address_space_dispatch_free(AddressSpaceDispatch *d)
2119 {
2120     phys_sections_free(&d->map);
2121     g_free(d);
2122 }
2123
2124 static void mem_commit(MemoryListener *listener)
2125 {
2126     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
2127     AddressSpaceDispatch *cur = as->dispatch;
2128     AddressSpaceDispatch *next = as->next_dispatch;
2129
2130     phys_page_compact_all(next, next->map.nodes_nb);
2131
2132     atomic_rcu_set(&as->dispatch, next);
2133     if (cur) {
2134         call_rcu(cur, address_space_dispatch_free, rcu);
2135     }
2136 }
2137
2138 static void tcg_commit(MemoryListener *listener)
2139 {
2140     CPUState *cpu;
2141
2142     /* since each CPU stores ram addresses in its TLB cache, we must
2143        reset the modified entries */
2144     /* XXX: slow ! */
2145     CPU_FOREACH(cpu) {
2146         /* FIXME: Disentangle the cpu.h circular files deps so we can
2147            directly get the right CPU from listener.  */
2148         if (cpu->tcg_as_listener != listener) {
2149             continue;
2150         }
2151         cpu_reload_memory_map(cpu);
2152     }
2153 }
2154
2155 static void core_log_global_start(MemoryListener *listener)
2156 {
2157     cpu_physical_memory_set_dirty_tracking(true);
2158 }
2159
2160 static void core_log_global_stop(MemoryListener *listener)
2161 {
2162     cpu_physical_memory_set_dirty_tracking(false);
2163 }
2164
2165 static MemoryListener core_memory_listener = {
2166     .log_global_start = core_log_global_start,
2167     .log_global_stop = core_log_global_stop,
2168     .priority = 1,
2169 };
2170
2171 void address_space_init_dispatch(AddressSpace *as)
2172 {
2173     as->dispatch = NULL;
2174     as->dispatch_listener = (MemoryListener) {
2175         .begin = mem_begin,
2176         .commit = mem_commit,
2177         .region_add = mem_add,
2178         .region_nop = mem_add,
2179         .priority = 0,
2180     };
2181     memory_listener_register(&as->dispatch_listener, as);
2182 }
2183
2184 void address_space_unregister(AddressSpace *as)
2185 {
2186     memory_listener_unregister(&as->dispatch_listener);
2187 }
2188
2189 void address_space_destroy_dispatch(AddressSpace *as)
2190 {
2191     AddressSpaceDispatch *d = as->dispatch;
2192
2193     atomic_rcu_set(&as->dispatch, NULL);
2194     if (d) {
2195         call_rcu(d, address_space_dispatch_free, rcu);
2196     }
2197 }
2198
2199 static void memory_map_init(void)
2200 {
2201     system_memory = g_malloc(sizeof(*system_memory));
2202
2203     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2204     address_space_init(&address_space_memory, system_memory, "memory");
2205
2206     system_io = g_malloc(sizeof(*system_io));
2207     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2208                           65536);
2209     address_space_init(&address_space_io, system_io, "I/O");
2210
2211     memory_listener_register(&core_memory_listener, &address_space_memory);
2212 }
2213
2214 MemoryRegion *get_system_memory(void)
2215 {
2216     return system_memory;
2217 }
2218
2219 MemoryRegion *get_system_io(void)
2220 {
2221     return system_io;
2222 }
2223
2224 #endif /* !defined(CONFIG_USER_ONLY) */
2225
2226 /* physical memory access (slow version, mainly for debug) */
2227 #if defined(CONFIG_USER_ONLY)
2228 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2229                         uint8_t *buf, int len, int is_write)
2230 {
2231     int l, flags;
2232     target_ulong page;
2233     void * p;
2234
2235     while (len > 0) {
2236         page = addr & TARGET_PAGE_MASK;
2237         l = (page + TARGET_PAGE_SIZE) - addr;
2238         if (l > len)
2239             l = len;
2240         flags = page_get_flags(page);
2241         if (!(flags & PAGE_VALID))
2242             return -1;
2243         if (is_write) {
2244             if (!(flags & PAGE_WRITE))
2245                 return -1;
2246             /* XXX: this code should not depend on lock_user */
2247             if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
2248                 return -1;
2249             memcpy(p, buf, l);
2250             unlock_user(p, addr, l);
2251         } else {
2252             if (!(flags & PAGE_READ))
2253                 return -1;
2254             /* XXX: this code should not depend on lock_user */
2255             if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
2256                 return -1;
2257             memcpy(buf, p, l);
2258             unlock_user(p, addr, 0);
2259         }
2260         len -= l;
2261         buf += l;
2262         addr += l;
2263     }
2264     return 0;
2265 }
2266
2267 #else
2268
2269 static void invalidate_and_set_dirty(hwaddr addr,
2270                                      hwaddr length)
2271 {
2272     if (cpu_physical_memory_range_includes_clean(addr, length)) {
2273         tb_invalidate_phys_range(addr, addr + length, 0);
2274         cpu_physical_memory_set_dirty_range_nocode(addr, length);
2275     }
2276     xen_modified_memory(addr, length);
2277 }
2278
2279 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2280 {
2281     unsigned access_size_max = mr->ops->valid.max_access_size;
2282
2283     /* Regions are assumed to support 1-4 byte accesses unless
2284        otherwise specified.  */
2285     if (access_size_max == 0) {
2286         access_size_max = 4;
2287     }
2288
2289     /* Bound the maximum access by the alignment of the address.  */
2290     if (!mr->ops->impl.unaligned) {
2291         unsigned align_size_max = addr & -addr;
2292         if (align_size_max != 0 && align_size_max < access_size_max) {
2293             access_size_max = align_size_max;
2294         }
2295     }
2296
2297     /* Don't attempt accesses larger than the maximum.  */
2298     if (l > access_size_max) {
2299         l = access_size_max;
2300     }
2301     if (l & (l - 1)) {
2302         l = 1 << (qemu_fls(l) - 1);
2303     }
2304
2305     return l;
2306 }
2307
2308 bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
2309                       int len, bool is_write)
2310 {
2311     hwaddr l;
2312     uint8_t *ptr;
2313     uint64_t val;
2314     hwaddr addr1;
2315     MemoryRegion *mr;
2316     bool error = false;
2317
2318     while (len > 0) {
2319         l = len;
2320         mr = address_space_translate(as, addr, &addr1, &l, is_write);
2321
2322         if (is_write) {
2323             if (!memory_access_is_direct(mr, is_write)) {
2324                 l = memory_access_size(mr, l, addr1);
2325                 /* XXX: could force current_cpu to NULL to avoid
2326                    potential bugs */
2327                 switch (l) {
2328                 case 8:
2329                     /* 64 bit write access */
2330                     val = ldq_p(buf);
2331                     error |= io_mem_write(mr, addr1, val, 8);
2332                     break;
2333                 case 4:
2334                     /* 32 bit write access */
2335                     val = ldl_p(buf);
2336                     error |= io_mem_write(mr, addr1, val, 4);
2337                     break;
2338                 case 2:
2339                     /* 16 bit write access */
2340                     val = lduw_p(buf);
2341                     error |= io_mem_write(mr, addr1, val, 2);
2342                     break;
2343                 case 1:
2344                     /* 8 bit write access */
2345                     val = ldub_p(buf);
2346                     error |= io_mem_write(mr, addr1, val, 1);
2347                     break;
2348                 default:
2349                     abort();
2350                 }
2351             } else {
2352                 addr1 += memory_region_get_ram_addr(mr);
2353                 /* RAM case */
2354                 ptr = qemu_get_ram_ptr(addr1);
2355                 memcpy(ptr, buf, l);
2356                 invalidate_and_set_dirty(addr1, l);
2357             }
2358         } else {
2359             if (!memory_access_is_direct(mr, is_write)) {
2360                 /* I/O case */
2361                 l = memory_access_size(mr, l, addr1);
2362                 switch (l) {
2363                 case 8:
2364                     /* 64 bit read access */
2365                     error |= io_mem_read(mr, addr1, &val, 8);
2366                     stq_p(buf, val);
2367                     break;
2368                 case 4:
2369                     /* 32 bit read access */
2370                     error |= io_mem_read(mr, addr1, &val, 4);
2371                     stl_p(buf, val);
2372                     break;
2373                 case 2:
2374                     /* 16 bit read access */
2375                     error |= io_mem_read(mr, addr1, &val, 2);
2376                     stw_p(buf, val);
2377                     break;
2378                 case 1:
2379                     /* 8 bit read access */
2380                     error |= io_mem_read(mr, addr1, &val, 1);
2381                     stb_p(buf, val);
2382                     break;
2383                 default:
2384                     abort();
2385                 }
2386             } else {
2387                 /* RAM case */
2388                 ptr = qemu_get_ram_ptr(mr->ram_addr + addr1);
2389                 memcpy(buf, ptr, l);
2390             }
2391         }
2392         len -= l;
2393         buf += l;
2394         addr += l;
2395     }
2396
2397     return error;
2398 }
2399
2400 bool address_space_write(AddressSpace *as, hwaddr addr,
2401                          const uint8_t *buf, int len)
2402 {
2403     return address_space_rw(as, addr, (uint8_t *)buf, len, true);
2404 }
2405
2406 bool address_space_read(AddressSpace *as, hwaddr addr, uint8_t *buf, int len)
2407 {
2408     return address_space_rw(as, addr, buf, len, false);
2409 }
2410
2411
2412 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
2413                             int len, int is_write)
2414 {
2415     address_space_rw(&address_space_memory, addr, buf, len, is_write);
2416 }
2417
/* Operation selector for cpu_physical_memory_write_rom_internal(). */
enum write_rom_type {
    WRITE_DATA,     /* copy the caller's buffer into RAM/ROM */
    FLUSH_CACHE,    /* flush the host icache over the range; no data copied */
};
2422
2423 static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as,
2424     hwaddr addr, const uint8_t *buf, int len, enum write_rom_type type)
2425 {
2426     hwaddr l;
2427     uint8_t *ptr;
2428     hwaddr addr1;
2429     MemoryRegion *mr;
2430
2431     while (len > 0) {
2432         l = len;
2433         mr = address_space_translate(as, addr, &addr1, &l, true);
2434
2435         if (!(memory_region_is_ram(mr) ||
2436               memory_region_is_romd(mr))) {
2437             /* do nothing */
2438         } else {
2439             addr1 += memory_region_get_ram_addr(mr);
2440             /* ROM/RAM case */
2441             ptr = qemu_get_ram_ptr(addr1);
2442             switch (type) {
2443             case WRITE_DATA:
2444                 memcpy(ptr, buf, l);
2445                 invalidate_and_set_dirty(addr1, l);
2446                 break;
2447             case FLUSH_CACHE:
2448                 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
2449                 break;
2450             }
2451         }
2452         len -= l;
2453         buf += l;
2454         addr += l;
2455     }
2456 }
2457
2458 /* used for ROM loading : can write in RAM and ROM */
2459 void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr,
2460                                    const uint8_t *buf, int len)
2461 {
2462     cpu_physical_memory_write_rom_internal(as, addr, buf, len, WRITE_DATA);
2463 }
2464
2465 void cpu_flush_icache_range(hwaddr start, int len)
2466 {
2467     /*
2468      * This function should do the same thing as an icache flush that was
2469      * triggered from within the guest. For TCG we are always cache coherent,
2470      * so there is no need to flush anything. For KVM / Xen we need to flush
2471      * the host's instruction cache at least.
2472      */
2473     if (tcg_enabled()) {
2474         return;
2475     }
2476
2477     cpu_physical_memory_write_rom_internal(&address_space_memory,
2478                                            start, NULL, len, FLUSH_CACHE);
2479 }
2480
/* State of the single bounce buffer that address_space_map() falls back to
 * when the target cannot be accessed directly.
 */
typedef struct {
    MemoryRegion *mr;   /* region referenced while the buffer is in use */
    void *buffer;       /* host buffer; NULL while no bounce mapping exists */
    hwaddr addr;        /* guest address the buffer stands in for */
    hwaddr len;         /* length of the bounce mapping in bytes */
} BounceBuffer;

/* The one and only bounce buffer. */
static BounceBuffer bounce;
2489
/* A party waiting to be told that the bounce buffer has been released. */
typedef struct MapClient {
    void *opaque;                       /* argument passed to callback */
    void (*callback)(void *opaque);     /* invoked by cpu_notify_map_clients() */
    QLIST_ENTRY(MapClient) link;        /* linkage in map_client_list */
} MapClient;

/* All currently registered map clients. */
static QLIST_HEAD(map_client_list, MapClient) map_client_list
    = QLIST_HEAD_INITIALIZER(map_client_list);
2498
2499 void *cpu_register_map_client(void *opaque, void (*callback)(void *opaque))
2500 {
2501     MapClient *client = g_malloc(sizeof(*client));
2502
2503     client->opaque = opaque;
2504     client->callback = callback;
2505     QLIST_INSERT_HEAD(&map_client_list, client, link);
2506     return client;
2507 }
2508
2509 static void cpu_unregister_map_client(void *_client)
2510 {
2511     MapClient *client = (MapClient *)_client;
2512
2513     QLIST_REMOVE(client, link);
2514     g_free(client);
2515 }
2516
2517 static void cpu_notify_map_clients(void)
2518 {
2519     MapClient *client;
2520
2521     while (!QLIST_EMPTY(&map_client_list)) {
2522         client = QLIST_FIRST(&map_client_list);
2523         client->callback(client->opaque);
2524         cpu_unregister_map_client(client);
2525     }
2526 }
2527
2528 bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write)
2529 {
2530     MemoryRegion *mr;
2531     hwaddr l, xlat;
2532
2533     while (len > 0) {
2534         l = len;
2535         mr = address_space_translate(as, addr, &xlat, &l, is_write);
2536         if (!memory_access_is_direct(mr, is_write)) {
2537             l = memory_access_size(mr, l, addr);
2538             if (!memory_region_access_valid(mr, xlat, l, is_write)) {
2539                 return false;
2540             }
2541         }
2542
2543         len -= l;
2544         addr += l;
2545     }
2546     return true;
2547 }
2548
/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
 *
 * A request with *plen == 0 returns NULL.  When the start of the range is
 * not directly accessible, the single global bounce buffer is used: the
 * mapping is capped at TARGET_PAGE_SIZE, and NULL is returned if the
 * buffer is already in use.  In both the bounce and the direct case a
 * reference is taken on the memory region; address_space_unmap() drops it.
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write)
{
    hwaddr len = *plen;
    hwaddr done = 0;
    hwaddr l, xlat, base;
    MemoryRegion *mr, *this_mr;
    ram_addr_t raddr;

    if (len == 0) {
        return NULL;
    }

    l = len;
    mr = address_space_translate(as, addr, &xlat, &l, is_write);
    if (!memory_access_is_direct(mr, is_write)) {
        /* Bounce path: only one bounce mapping can exist at a time. */
        if (bounce.buffer) {
            return NULL;
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        /* For a read mapping, fill the buffer with the current contents. */
        if (!is_write) {
            address_space_read(as, addr, bounce.buffer, l);
        }

        *plen = l;
        return bounce.buffer;
    }

    /* Direct path: remember where the mapping starts inside the region. */
    base = xlat;
    raddr = memory_region_get_ram_addr(mr);

    /* Extend the mapping for as long as the following chunks translate to
     * the same region at the contiguous offset base + done. */
    for (;;) {
        len -= l;
        addr += l;
        done += l;
        if (len == 0) {
            break;
        }

        l = len;
        this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
        if (this_mr != mr || xlat != base + done) {
            break;
        }
    }

    memory_region_ref(mr);
    *plen = done;
    /* NOTE(review): qemu_ram_ptr_length() receives plen and presumably may
     * adjust it further -- confirm against its definition. */
    return qemu_ram_ptr_length(raddr + base, plen);
}
2615
2616 /* Unmaps a memory region previously mapped by address_space_map().
2617  * Will also mark the memory as dirty if is_write == 1.  access_len gives
2618  * the amount of memory that was actually read or written by the caller.
2619  */
2620 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
2621                          int is_write, hwaddr access_len)
2622 {
2623     if (buffer != bounce.buffer) {
2624         MemoryRegion *mr;
2625         ram_addr_t addr1;
2626
2627         mr = qemu_ram_addr_from_host(buffer, &addr1);
2628         assert(mr != NULL);
2629         if (is_write) {
2630             invalidate_and_set_dirty(addr1, access_len);
2631         }
2632         if (xen_enabled()) {
2633             xen_invalidate_map_cache_entry(buffer);
2634         }
2635         memory_region_unref(mr);
2636         return;
2637     }
2638     if (is_write) {
2639         address_space_write(as, bounce.addr, bounce.buffer, access_len);
2640     }
2641     qemu_vfree(bounce.buffer);
2642     bounce.buffer = NULL;
2643     memory_region_unref(bounce.mr);
2644     cpu_notify_map_clients();
2645 }
2646
2647 void *cpu_physical_memory_map(hwaddr addr,
2648                               hwaddr *plen,
2649                               int is_write)
2650 {
2651     return address_space_map(&address_space_memory, addr, plen, is_write);
2652 }
2653
2654 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
2655                                int is_write, hwaddr access_len)
2656 {
2657     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
2658 }
2659
2660 /* warning: addr must be aligned */
2661 static inline uint32_t ldl_phys_internal(AddressSpace *as, hwaddr addr,
2662                                          enum device_endian endian)
2663 {
2664     uint8_t *ptr;
2665     uint64_t val;
2666     MemoryRegion *mr;
2667     hwaddr l = 4;
2668     hwaddr addr1;
2669
2670     mr = address_space_translate(as, addr, &addr1, &l, false);
2671     if (l < 4 || !memory_access_is_direct(mr, false)) {
2672         /* I/O case */
2673         io_mem_read(mr, addr1, &val, 4);
2674 #if defined(TARGET_WORDS_BIGENDIAN)
2675         if (endian == DEVICE_LITTLE_ENDIAN) {
2676             val = bswap32(val);
2677         }
2678 #else
2679         if (endian == DEVICE_BIG_ENDIAN) {
2680             val = bswap32(val);
2681         }
2682 #endif
2683     } else {
2684         /* RAM case */
2685         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2686                                 & TARGET_PAGE_MASK)
2687                                + addr1);
2688         switch (endian) {
2689         case DEVICE_LITTLE_ENDIAN:
2690             val = ldl_le_p(ptr);
2691             break;
2692         case DEVICE_BIG_ENDIAN:
2693             val = ldl_be_p(ptr);
2694             break;
2695         default:
2696             val = ldl_p(ptr);
2697             break;
2698         }
2699     }
2700     return val;
2701 }
2702
2703 uint32_t ldl_phys(AddressSpace *as, hwaddr addr)
2704 {
2705     return ldl_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2706 }
2707
2708 uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr)
2709 {
2710     return ldl_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2711 }
2712
2713 uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr)
2714 {
2715     return ldl_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2716 }
2717
2718 /* warning: addr must be aligned */
2719 static inline uint64_t ldq_phys_internal(AddressSpace *as, hwaddr addr,
2720                                          enum device_endian endian)
2721 {
2722     uint8_t *ptr;
2723     uint64_t val;
2724     MemoryRegion *mr;
2725     hwaddr l = 8;
2726     hwaddr addr1;
2727
2728     mr = address_space_translate(as, addr, &addr1, &l,
2729                                  false);
2730     if (l < 8 || !memory_access_is_direct(mr, false)) {
2731         /* I/O case */
2732         io_mem_read(mr, addr1, &val, 8);
2733 #if defined(TARGET_WORDS_BIGENDIAN)
2734         if (endian == DEVICE_LITTLE_ENDIAN) {
2735             val = bswap64(val);
2736         }
2737 #else
2738         if (endian == DEVICE_BIG_ENDIAN) {
2739             val = bswap64(val);
2740         }
2741 #endif
2742     } else {
2743         /* RAM case */
2744         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2745                                 & TARGET_PAGE_MASK)
2746                                + addr1);
2747         switch (endian) {
2748         case DEVICE_LITTLE_ENDIAN:
2749             val = ldq_le_p(ptr);
2750             break;
2751         case DEVICE_BIG_ENDIAN:
2752             val = ldq_be_p(ptr);
2753             break;
2754         default:
2755             val = ldq_p(ptr);
2756             break;
2757         }
2758     }
2759     return val;
2760 }
2761
2762 uint64_t ldq_phys(AddressSpace *as, hwaddr addr)
2763 {
2764     return ldq_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2765 }
2766
2767 uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr)
2768 {
2769     return ldq_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2770 }
2771
2772 uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr)
2773 {
2774     return ldq_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2775 }
2776
2777 /* XXX: optimize */
2778 uint32_t ldub_phys(AddressSpace *as, hwaddr addr)
2779 {
2780     uint8_t val;
2781     address_space_rw(as, addr, &val, 1, 0);
2782     return val;
2783 }
2784
2785 /* warning: addr must be aligned */
2786 static inline uint32_t lduw_phys_internal(AddressSpace *as, hwaddr addr,
2787                                           enum device_endian endian)
2788 {
2789     uint8_t *ptr;
2790     uint64_t val;
2791     MemoryRegion *mr;
2792     hwaddr l = 2;
2793     hwaddr addr1;
2794
2795     mr = address_space_translate(as, addr, &addr1, &l,
2796                                  false);
2797     if (l < 2 || !memory_access_is_direct(mr, false)) {
2798         /* I/O case */
2799         io_mem_read(mr, addr1, &val, 2);
2800 #if defined(TARGET_WORDS_BIGENDIAN)
2801         if (endian == DEVICE_LITTLE_ENDIAN) {
2802             val = bswap16(val);
2803         }
2804 #else
2805         if (endian == DEVICE_BIG_ENDIAN) {
2806             val = bswap16(val);
2807         }
2808 #endif
2809     } else {
2810         /* RAM case */
2811         ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
2812                                 & TARGET_PAGE_MASK)
2813                                + addr1);
2814         switch (endian) {
2815         case DEVICE_LITTLE_ENDIAN:
2816             val = lduw_le_p(ptr);
2817             break;
2818         case DEVICE_BIG_ENDIAN:
2819             val = lduw_be_p(ptr);
2820             break;
2821         default:
2822             val = lduw_p(ptr);
2823             break;
2824         }
2825     }
2826     return val;
2827 }
2828
2829 uint32_t lduw_phys(AddressSpace *as, hwaddr addr)
2830 {
2831     return lduw_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
2832 }
2833
2834 uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr)
2835 {
2836     return lduw_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
2837 }
2838
2839 uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr)
2840 {
2841     return lduw_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
2842 }
2843
2844 /* warning: addr must be aligned. The ram page is not masked as dirty
2845    and the code inside is not invalidated. It is useful if the dirty
2846    bits are used to track modified PTEs */
2847 void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val)
2848 {
2849     uint8_t *ptr;
2850     MemoryRegion *mr;
2851     hwaddr l = 4;
2852     hwaddr addr1;
2853
2854     mr = address_space_translate(as, addr, &addr1, &l,
2855                                  true);
2856     if (l < 4 || !memory_access_is_direct(mr, true)) {
2857         io_mem_write(mr, addr1, val, 4);
2858     } else {
2859         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2860         ptr = qemu_get_ram_ptr(addr1);
2861         stl_p(ptr, val);
2862
2863         if (unlikely(in_migration)) {
2864             if (cpu_physical_memory_is_clean(addr1)) {
2865                 /* invalidate code */
2866                 tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
2867                 /* set dirty bit */
2868                 cpu_physical_memory_set_dirty_range_nocode(addr1, 4);
2869             }
2870         }
2871     }
2872 }
2873
2874 /* warning: addr must be aligned */
2875 static inline void stl_phys_internal(AddressSpace *as,
2876                                      hwaddr addr, uint32_t val,
2877                                      enum device_endian endian)
2878 {
2879     uint8_t *ptr;
2880     MemoryRegion *mr;
2881     hwaddr l = 4;
2882     hwaddr addr1;
2883
2884     mr = address_space_translate(as, addr, &addr1, &l,
2885                                  true);
2886     if (l < 4 || !memory_access_is_direct(mr, true)) {
2887 #if defined(TARGET_WORDS_BIGENDIAN)
2888         if (endian == DEVICE_LITTLE_ENDIAN) {
2889             val = bswap32(val);
2890         }
2891 #else
2892         if (endian == DEVICE_BIG_ENDIAN) {
2893             val = bswap32(val);
2894         }
2895 #endif
2896         io_mem_write(mr, addr1, val, 4);
2897     } else {
2898         /* RAM case */
2899         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2900         ptr = qemu_get_ram_ptr(addr1);
2901         switch (endian) {
2902         case DEVICE_LITTLE_ENDIAN:
2903             stl_le_p(ptr, val);
2904             break;
2905         case DEVICE_BIG_ENDIAN:
2906             stl_be_p(ptr, val);
2907             break;
2908         default:
2909             stl_p(ptr, val);
2910             break;
2911         }
2912         invalidate_and_set_dirty(addr1, 4);
2913     }
2914 }
2915
2916 void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2917 {
2918     stl_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
2919 }
2920
2921 void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2922 {
2923     stl_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
2924 }
2925
2926 void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2927 {
2928     stl_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
2929 }
2930
2931 /* XXX: optimize */
2932 void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2933 {
2934     uint8_t v = val;
2935     address_space_rw(as, addr, &v, 1, 1);
2936 }
2937
2938 /* warning: addr must be aligned */
2939 static inline void stw_phys_internal(AddressSpace *as,
2940                                      hwaddr addr, uint32_t val,
2941                                      enum device_endian endian)
2942 {
2943     uint8_t *ptr;
2944     MemoryRegion *mr;
2945     hwaddr l = 2;
2946     hwaddr addr1;
2947
2948     mr = address_space_translate(as, addr, &addr1, &l, true);
2949     if (l < 2 || !memory_access_is_direct(mr, true)) {
2950 #if defined(TARGET_WORDS_BIGENDIAN)
2951         if (endian == DEVICE_LITTLE_ENDIAN) {
2952             val = bswap16(val);
2953         }
2954 #else
2955         if (endian == DEVICE_BIG_ENDIAN) {
2956             val = bswap16(val);
2957         }
2958 #endif
2959         io_mem_write(mr, addr1, val, 2);
2960     } else {
2961         /* RAM case */
2962         addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
2963         ptr = qemu_get_ram_ptr(addr1);
2964         switch (endian) {
2965         case DEVICE_LITTLE_ENDIAN:
2966             stw_le_p(ptr, val);
2967             break;
2968         case DEVICE_BIG_ENDIAN:
2969             stw_be_p(ptr, val);
2970             break;
2971         default:
2972             stw_p(ptr, val);
2973             break;
2974         }
2975         invalidate_and_set_dirty(addr1, 2);
2976     }
2977 }
2978
2979 void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2980 {
2981     stw_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
2982 }
2983
2984 void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2985 {
2986     stw_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
2987 }
2988
2989 void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
2990 {
2991     stw_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
2992 }
2993
2994 /* XXX: optimize */
2995 void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val)
2996 {
2997     val = tswap64(val);
2998     address_space_rw(as, addr, (void *) &val, 8, 1);
2999 }
3000
3001 void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3002 {
3003     val = cpu_to_le64(val);
3004     address_space_rw(as, addr, (void *) &val, 8, 1);
3005 }
3006
3007 void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val)
3008 {
3009     val = cpu_to_be64(val);
3010     address_space_rw(as, addr, (void *) &val, 8, 1);
3011 }
3012
3013 /* virtual memory access for debug (includes writing to ROM) */
3014 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3015                         uint8_t *buf, int len, int is_write)
3016 {
3017     int l;
3018     hwaddr phys_addr;
3019     target_ulong page;
3020
3021     while (len > 0) {
3022         page = addr & TARGET_PAGE_MASK;
3023         phys_addr = cpu_get_phys_page_debug(cpu, page);
3024         /* if no physical page mapped, return an error */
3025         if (phys_addr == -1)
3026             return -1;
3027         l = (page + TARGET_PAGE_SIZE) - addr;
3028         if (l > len)
3029             l = len;
3030         phys_addr += (addr & ~TARGET_PAGE_MASK);
3031         if (is_write) {
3032             cpu_physical_memory_write_rom(cpu->as, phys_addr, buf, l);
3033         } else {
3034             address_space_rw(cpu->as, phys_addr, buf, l, 0);
3035         }
3036         len -= l;
3037         buf += l;
3038         addr += l;
3039     }
3040     return 0;
3041 }
3042 #endif
3043
/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 *
 * Returns true when TARGET_WORDS_BIGENDIAN is defined at build time and
 * false otherwise.
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
    bool big_endian = false;

#if defined(TARGET_WORDS_BIGENDIAN)
    big_endian = true;
#endif
    return big_endian;
}
3057
3058 #ifndef CONFIG_USER_ONLY
3059 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3060 {
3061     MemoryRegion*mr;
3062     hwaddr l = 1;
3063
3064     mr = address_space_translate(&address_space_memory,
3065                                  phys_addr, &phys_addr, &l, false);
3066
3067     return !(memory_region_is_ram(mr) ||
3068              memory_region_is_romd(mr));
3069 }
3070
3071 void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3072 {
3073     RAMBlock *block;
3074
3075     rcu_read_lock();
3076     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
3077         func(block->host, block->offset, block->used_length, opaque);
3078     }
3079     rcu_read_unlock();
3080 }
3081 #endif