4 * Copyright (c) 2003 Fabrice Bellard
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21 #include <sys/types.h>
25 #include "qemu-common.h"
29 #if !defined(CONFIG_USER_ONLY)
30 #include "hw/boards.h"
33 #include "qemu/osdep.h"
34 #include "sysemu/kvm.h"
35 #include "sysemu/sysemu.h"
36 #include "hw/xen/xen.h"
37 #include "qemu/timer.h"
38 #include "qemu/config-file.h"
39 #include "qemu/error-report.h"
40 #include "exec/memory.h"
41 #include "sysemu/dma.h"
42 #include "exec/address-spaces.h"
43 #if defined(CONFIG_USER_ONLY)
45 #else /* !CONFIG_USER_ONLY */
46 #include "sysemu/xen-mapcache.h"
49 #include "exec/cpu-all.h"
50 #include "qemu/rcu_queue.h"
51 #include "exec/cputlb.h"
52 #include "translate-all.h"
54 #include "exec/memory-internal.h"
55 #include "exec/ram_addr.h"
57 #include "qemu/range.h"
59 //#define DEBUG_SUBPAGE
61 #if !defined(CONFIG_USER_ONLY)
62 static bool in_migration
;
64 /* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
65 * are protected by the ramlist lock.
67 RAMList ram_list
= { .blocks
= QLIST_HEAD_INITIALIZER(ram_list
.blocks
) };
69 static MemoryRegion
*system_memory
;
70 static MemoryRegion
*system_io
;
72 AddressSpace address_space_io
;
73 AddressSpace address_space_memory
;
75 MemoryRegion io_mem_rom
, io_mem_notdirty
;
76 static MemoryRegion io_mem_unassigned
;
78 /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
79 #define RAM_PREALLOC (1 << 0)
81 /* RAM is mmap-ed with MAP_SHARED */
82 #define RAM_SHARED (1 << 1)
84 /* Only a portion of RAM (used_length) is actually used, and migrated.
85 * This used_length size can change across reboots.
87 #define RAM_RESIZEABLE (1 << 2)
91 struct CPUTailQ cpus
= QTAILQ_HEAD_INITIALIZER(cpus
);
92 /* current CPU in the current thread. It is only valid inside
94 DEFINE_TLS(CPUState
*, current_cpu
);
95 /* 0 = Do not count executed instructions.
96 1 = Precise instruction counting.
97 2 = Adaptive rate instruction counting. */
100 #if !defined(CONFIG_USER_ONLY)
102 typedef struct PhysPageEntry PhysPageEntry
;
104 struct PhysPageEntry
{
105 /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
107 /* index into phys_sections (!skip) or phys_map_nodes (skip) */
111 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
113 /* Size of the L2 (and L3, etc) page tables. */
114 #define ADDR_SPACE_BITS 64
117 #define P_L2_SIZE (1 << P_L2_BITS)
119 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
121 typedef PhysPageEntry Node
[P_L2_SIZE
];
123 typedef struct PhysPageMap
{
126 unsigned sections_nb
;
127 unsigned sections_nb_alloc
;
129 unsigned nodes_nb_alloc
;
131 MemoryRegionSection
*sections
;
134 struct AddressSpaceDispatch
{
137 /* This is a multi-level map on the physical address space.
138 * The bottom level has pointers to MemoryRegionSections.
140 PhysPageEntry phys_map
;
145 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
146 typedef struct subpage_t
{
150 uint16_t sub_section
[TARGET_PAGE_SIZE
];
153 #define PHYS_SECTION_UNASSIGNED 0
154 #define PHYS_SECTION_NOTDIRTY 1
155 #define PHYS_SECTION_ROM 2
156 #define PHYS_SECTION_WATCH 3
158 static void io_mem_init(void);
159 static void memory_map_init(void);
160 static void tcg_commit(MemoryListener
*listener
);
162 static MemoryRegion io_mem_watch
;
165 #if !defined(CONFIG_USER_ONLY)
167 static void phys_map_node_reserve(PhysPageMap
*map
, unsigned nodes
)
169 if (map
->nodes_nb
+ nodes
> map
->nodes_nb_alloc
) {
170 map
->nodes_nb_alloc
= MAX(map
->nodes_nb_alloc
* 2, 16);
171 map
->nodes_nb_alloc
= MAX(map
->nodes_nb_alloc
, map
->nodes_nb
+ nodes
);
172 map
->nodes
= g_renew(Node
, map
->nodes
, map
->nodes_nb_alloc
);
176 static uint32_t phys_map_node_alloc(PhysPageMap
*map
)
181 ret
= map
->nodes_nb
++;
182 assert(ret
!= PHYS_MAP_NODE_NIL
);
183 assert(ret
!= map
->nodes_nb_alloc
);
184 for (i
= 0; i
< P_L2_SIZE
; ++i
) {
185 map
->nodes
[ret
][i
].skip
= 1;
186 map
->nodes
[ret
][i
].ptr
= PHYS_MAP_NODE_NIL
;
191 static void phys_page_set_level(PhysPageMap
*map
, PhysPageEntry
*lp
,
192 hwaddr
*index
, hwaddr
*nb
, uint16_t leaf
,
197 hwaddr step
= (hwaddr
)1 << (level
* P_L2_BITS
);
199 if (lp
->skip
&& lp
->ptr
== PHYS_MAP_NODE_NIL
) {
200 lp
->ptr
= phys_map_node_alloc(map
);
201 p
= map
->nodes
[lp
->ptr
];
203 for (i
= 0; i
< P_L2_SIZE
; i
++) {
205 p
[i
].ptr
= PHYS_SECTION_UNASSIGNED
;
209 p
= map
->nodes
[lp
->ptr
];
211 lp
= &p
[(*index
>> (level
* P_L2_BITS
)) & (P_L2_SIZE
- 1)];
213 while (*nb
&& lp
< &p
[P_L2_SIZE
]) {
214 if ((*index
& (step
- 1)) == 0 && *nb
>= step
) {
220 phys_page_set_level(map
, lp
, index
, nb
, leaf
, level
- 1);
226 static void phys_page_set(AddressSpaceDispatch
*d
,
227 hwaddr index
, hwaddr nb
,
230 /* Wildly overreserve - it doesn't matter much. */
231 phys_map_node_reserve(&d
->map
, 3 * P_L2_LEVELS
);
233 phys_page_set_level(&d
->map
, &d
->phys_map
, &index
, &nb
, leaf
, P_L2_LEVELS
- 1);
236 /* Compact a non leaf page entry. Simply detect that the entry has a single child,
237 * and update our entry so we can skip it and go directly to the destination.
239 static void phys_page_compact(PhysPageEntry
*lp
, Node
*nodes
, unsigned long *compacted
)
241 unsigned valid_ptr
= P_L2_SIZE
;
246 if (lp
->ptr
== PHYS_MAP_NODE_NIL
) {
251 for (i
= 0; i
< P_L2_SIZE
; i
++) {
252 if (p
[i
].ptr
== PHYS_MAP_NODE_NIL
) {
259 phys_page_compact(&p
[i
], nodes
, compacted
);
263 /* We can only compress if there's only one child. */
268 assert(valid_ptr
< P_L2_SIZE
);
270 /* Don't compress if it won't fit in the # of bits we have. */
271 if (lp
->skip
+ p
[valid_ptr
].skip
>= (1 << 3)) {
275 lp
->ptr
= p
[valid_ptr
].ptr
;
276 if (!p
[valid_ptr
].skip
) {
277 /* If our only child is a leaf, make this a leaf. */
278 /* By design, we should have made this node a leaf to begin with so we
279 * should never reach here.
280 * But since it's so simple to handle this, let's do it just in case we
285 lp
->skip
+= p
[valid_ptr
].skip
;
289 static void phys_page_compact_all(AddressSpaceDispatch
*d
, int nodes_nb
)
291 DECLARE_BITMAP(compacted
, nodes_nb
);
293 if (d
->phys_map
.skip
) {
294 phys_page_compact(&d
->phys_map
, d
->map
.nodes
, compacted
);
298 static MemoryRegionSection
*phys_page_find(PhysPageEntry lp
, hwaddr addr
,
299 Node
*nodes
, MemoryRegionSection
*sections
)
302 hwaddr index
= addr
>> TARGET_PAGE_BITS
;
305 for (i
= P_L2_LEVELS
; lp
.skip
&& (i
-= lp
.skip
) >= 0;) {
306 if (lp
.ptr
== PHYS_MAP_NODE_NIL
) {
307 return §ions
[PHYS_SECTION_UNASSIGNED
];
310 lp
= p
[(index
>> (i
* P_L2_BITS
)) & (P_L2_SIZE
- 1)];
313 if (sections
[lp
.ptr
].size
.hi
||
314 range_covers_byte(sections
[lp
.ptr
].offset_within_address_space
,
315 sections
[lp
.ptr
].size
.lo
, addr
)) {
316 return §ions
[lp
.ptr
];
318 return §ions
[PHYS_SECTION_UNASSIGNED
];
322 bool memory_region_is_unassigned(MemoryRegion
*mr
)
324 return mr
!= &io_mem_rom
&& mr
!= &io_mem_notdirty
&& !mr
->rom_device
325 && mr
!= &io_mem_watch
;
328 /* Called from RCU critical section */
329 static MemoryRegionSection
*address_space_lookup_region(AddressSpaceDispatch
*d
,
331 bool resolve_subpage
)
333 MemoryRegionSection
*section
;
336 section
= phys_page_find(d
->phys_map
, addr
, d
->map
.nodes
, d
->map
.sections
);
337 if (resolve_subpage
&& section
->mr
->subpage
) {
338 subpage
= container_of(section
->mr
, subpage_t
, iomem
);
339 section
= &d
->map
.sections
[subpage
->sub_section
[SUBPAGE_IDX(addr
)]];
344 /* Called from RCU critical section */
345 static MemoryRegionSection
*
346 address_space_translate_internal(AddressSpaceDispatch
*d
, hwaddr addr
, hwaddr
*xlat
,
347 hwaddr
*plen
, bool resolve_subpage
)
349 MemoryRegionSection
*section
;
352 section
= address_space_lookup_region(d
, addr
, resolve_subpage
);
353 /* Compute offset within MemoryRegionSection */
354 addr
-= section
->offset_within_address_space
;
356 /* Compute offset within MemoryRegion */
357 *xlat
= addr
+ section
->offset_within_region
;
359 diff
= int128_sub(section
->mr
->size
, int128_make64(addr
));
360 *plen
= int128_get64(int128_min(diff
, int128_make64(*plen
)));
364 static inline bool memory_access_is_direct(MemoryRegion
*mr
, bool is_write
)
366 if (memory_region_is_ram(mr
)) {
367 return !(is_write
&& mr
->readonly
);
369 if (memory_region_is_romd(mr
)) {
376 MemoryRegion
*address_space_translate(AddressSpace
*as
, hwaddr addr
,
377 hwaddr
*xlat
, hwaddr
*plen
,
381 MemoryRegionSection
*section
;
386 AddressSpaceDispatch
*d
= atomic_rcu_read(&as
->dispatch
);
387 section
= address_space_translate_internal(d
, addr
, &addr
, plen
, true);
390 if (!mr
->iommu_ops
) {
394 iotlb
= mr
->iommu_ops
->translate(mr
, addr
, is_write
);
395 addr
= ((iotlb
.translated_addr
& ~iotlb
.addr_mask
)
396 | (addr
& iotlb
.addr_mask
));
397 *plen
= MIN(*plen
, (addr
| iotlb
.addr_mask
) - addr
+ 1);
398 if (!(iotlb
.perm
& (1 << is_write
))) {
399 mr
= &io_mem_unassigned
;
403 as
= iotlb
.target_as
;
406 if (xen_enabled() && memory_access_is_direct(mr
, is_write
)) {
407 hwaddr page
= ((addr
& TARGET_PAGE_MASK
) + TARGET_PAGE_SIZE
) - addr
;
408 *plen
= MIN(page
, *plen
);
416 /* Called from RCU critical section */
417 MemoryRegionSection
*
418 address_space_translate_for_iotlb(CPUState
*cpu
, hwaddr addr
,
419 hwaddr
*xlat
, hwaddr
*plen
)
421 MemoryRegionSection
*section
;
422 section
= address_space_translate_internal(cpu
->memory_dispatch
,
423 addr
, xlat
, plen
, false);
425 assert(!section
->mr
->iommu_ops
);
430 void cpu_exec_init_all(void)
432 #if !defined(CONFIG_USER_ONLY)
433 qemu_mutex_init(&ram_list
.mutex
);
439 #if !defined(CONFIG_USER_ONLY)
441 static int cpu_common_post_load(void *opaque
, int version_id
)
443 CPUState
*cpu
= opaque
;
445 /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
446 version_id is increased. */
447 cpu
->interrupt_request
&= ~0x01;
453 static int cpu_common_pre_load(void *opaque
)
455 CPUState
*cpu
= opaque
;
457 cpu
->exception_index
= -1;
462 static bool cpu_common_exception_index_needed(void *opaque
)
464 CPUState
*cpu
= opaque
;
466 return tcg_enabled() && cpu
->exception_index
!= -1;
469 static const VMStateDescription vmstate_cpu_common_exception_index
= {
470 .name
= "cpu_common/exception_index",
472 .minimum_version_id
= 1,
473 .fields
= (VMStateField
[]) {
474 VMSTATE_INT32(exception_index
, CPUState
),
475 VMSTATE_END_OF_LIST()
479 const VMStateDescription vmstate_cpu_common
= {
480 .name
= "cpu_common",
482 .minimum_version_id
= 1,
483 .pre_load
= cpu_common_pre_load
,
484 .post_load
= cpu_common_post_load
,
485 .fields
= (VMStateField
[]) {
486 VMSTATE_UINT32(halted
, CPUState
),
487 VMSTATE_UINT32(interrupt_request
, CPUState
),
488 VMSTATE_END_OF_LIST()
490 .subsections
= (VMStateSubsection
[]) {
492 .vmsd
= &vmstate_cpu_common_exception_index
,
493 .needed
= cpu_common_exception_index_needed
,
502 CPUState
*qemu_get_cpu(int index
)
507 if (cpu
->cpu_index
== index
) {
515 #if !defined(CONFIG_USER_ONLY)
516 void tcg_cpu_address_space_init(CPUState
*cpu
, AddressSpace
*as
)
518 /* We only support one address space per cpu at the moment. */
519 assert(cpu
->as
== as
);
521 if (cpu
->tcg_as_listener
) {
522 memory_listener_unregister(cpu
->tcg_as_listener
);
524 cpu
->tcg_as_listener
= g_new0(MemoryListener
, 1);
526 cpu
->tcg_as_listener
->commit
= tcg_commit
;
527 memory_listener_register(cpu
->tcg_as_listener
, as
);
531 void cpu_exec_init(CPUArchState
*env
)
533 CPUState
*cpu
= ENV_GET_CPU(env
);
534 CPUClass
*cc
= CPU_GET_CLASS(cpu
);
538 #if defined(CONFIG_USER_ONLY)
542 CPU_FOREACH(some_cpu
) {
545 cpu
->cpu_index
= cpu_index
;
547 QTAILQ_INIT(&cpu
->breakpoints
);
548 QTAILQ_INIT(&cpu
->watchpoints
);
549 #ifndef CONFIG_USER_ONLY
550 cpu
->as
= &address_space_memory
;
551 cpu
->thread_id
= qemu_get_thread_id();
552 cpu_reload_memory_map(cpu
);
554 QTAILQ_INSERT_TAIL(&cpus
, cpu
, node
);
555 #if defined(CONFIG_USER_ONLY)
558 if (qdev_get_vmsd(DEVICE(cpu
)) == NULL
) {
559 vmstate_register(NULL
, cpu_index
, &vmstate_cpu_common
, cpu
);
561 #if defined(CPU_SAVE_VERSION) && !defined(CONFIG_USER_ONLY)
562 register_savevm(NULL
, "cpu", cpu_index
, CPU_SAVE_VERSION
,
563 cpu_save
, cpu_load
, env
);
564 assert(cc
->vmsd
== NULL
);
565 assert(qdev_get_vmsd(DEVICE(cpu
)) == NULL
);
567 if (cc
->vmsd
!= NULL
) {
568 vmstate_register(NULL
, cpu_index
, cc
->vmsd
, cpu
);
572 #if defined(CONFIG_USER_ONLY)
573 static void breakpoint_invalidate(CPUState
*cpu
, target_ulong pc
)
575 tb_invalidate_phys_page_range(pc
, pc
+ 1, 0);
578 static void breakpoint_invalidate(CPUState
*cpu
, target_ulong pc
)
580 hwaddr phys
= cpu_get_phys_page_debug(cpu
, pc
);
582 tb_invalidate_phys_addr(cpu
->as
,
583 phys
| (pc
& ~TARGET_PAGE_MASK
));
588 #if defined(CONFIG_USER_ONLY)
589 void cpu_watchpoint_remove_all(CPUState
*cpu
, int mask
)
594 int cpu_watchpoint_remove(CPUState
*cpu
, vaddr addr
, vaddr len
,
600 void cpu_watchpoint_remove_by_ref(CPUState
*cpu
, CPUWatchpoint
*watchpoint
)
604 int cpu_watchpoint_insert(CPUState
*cpu
, vaddr addr
, vaddr len
,
605 int flags
, CPUWatchpoint
**watchpoint
)
610 /* Add a watchpoint. */
611 int cpu_watchpoint_insert(CPUState
*cpu
, vaddr addr
, vaddr len
,
612 int flags
, CPUWatchpoint
**watchpoint
)
616 /* forbid ranges which are empty or run off the end of the address space */
617 if (len
== 0 || (addr
+ len
- 1) < addr
) {
618 error_report("tried to set invalid watchpoint at %"
619 VADDR_PRIx
", len=%" VADDR_PRIu
, addr
, len
);
622 wp
= g_malloc(sizeof(*wp
));
628 /* keep all GDB-injected watchpoints in front */
629 if (flags
& BP_GDB
) {
630 QTAILQ_INSERT_HEAD(&cpu
->watchpoints
, wp
, entry
);
632 QTAILQ_INSERT_TAIL(&cpu
->watchpoints
, wp
, entry
);
635 tlb_flush_page(cpu
, addr
);
642 /* Remove a specific watchpoint. */
643 int cpu_watchpoint_remove(CPUState
*cpu
, vaddr addr
, vaddr len
,
648 QTAILQ_FOREACH(wp
, &cpu
->watchpoints
, entry
) {
649 if (addr
== wp
->vaddr
&& len
== wp
->len
650 && flags
== (wp
->flags
& ~BP_WATCHPOINT_HIT
)) {
651 cpu_watchpoint_remove_by_ref(cpu
, wp
);
658 /* Remove a specific watchpoint by reference. */
659 void cpu_watchpoint_remove_by_ref(CPUState
*cpu
, CPUWatchpoint
*watchpoint
)
661 QTAILQ_REMOVE(&cpu
->watchpoints
, watchpoint
, entry
);
663 tlb_flush_page(cpu
, watchpoint
->vaddr
);
668 /* Remove all matching watchpoints. */
669 void cpu_watchpoint_remove_all(CPUState
*cpu
, int mask
)
671 CPUWatchpoint
*wp
, *next
;
673 QTAILQ_FOREACH_SAFE(wp
, &cpu
->watchpoints
, entry
, next
) {
674 if (wp
->flags
& mask
) {
675 cpu_watchpoint_remove_by_ref(cpu
, wp
);
680 /* Return true if this watchpoint address matches the specified
681 * access (ie the address range covered by the watchpoint overlaps
682 * partially or completely with the address range covered by the
685 static inline bool cpu_watchpoint_address_matches(CPUWatchpoint
*wp
,
689 /* We know the lengths are non-zero, but a little caution is
690 * required to avoid errors in the case where the range ends
691 * exactly at the top of the address space and so addr + len
692 * wraps round to zero.
694 vaddr wpend
= wp
->vaddr
+ wp
->len
- 1;
695 vaddr addrend
= addr
+ len
- 1;
697 return !(addr
> wpend
|| wp
->vaddr
> addrend
);
702 /* Add a breakpoint. */
703 int cpu_breakpoint_insert(CPUState
*cpu
, vaddr pc
, int flags
,
704 CPUBreakpoint
**breakpoint
)
708 bp
= g_malloc(sizeof(*bp
));
713 /* keep all GDB-injected breakpoints in front */
714 if (flags
& BP_GDB
) {
715 QTAILQ_INSERT_HEAD(&cpu
->breakpoints
, bp
, entry
);
717 QTAILQ_INSERT_TAIL(&cpu
->breakpoints
, bp
, entry
);
720 breakpoint_invalidate(cpu
, pc
);
728 /* Remove a specific breakpoint. */
729 int cpu_breakpoint_remove(CPUState
*cpu
, vaddr pc
, int flags
)
733 QTAILQ_FOREACH(bp
, &cpu
->breakpoints
, entry
) {
734 if (bp
->pc
== pc
&& bp
->flags
== flags
) {
735 cpu_breakpoint_remove_by_ref(cpu
, bp
);
742 /* Remove a specific breakpoint by reference. */
743 void cpu_breakpoint_remove_by_ref(CPUState
*cpu
, CPUBreakpoint
*breakpoint
)
745 QTAILQ_REMOVE(&cpu
->breakpoints
, breakpoint
, entry
);
747 breakpoint_invalidate(cpu
, breakpoint
->pc
);
752 /* Remove all matching breakpoints. */
753 void cpu_breakpoint_remove_all(CPUState
*cpu
, int mask
)
755 CPUBreakpoint
*bp
, *next
;
757 QTAILQ_FOREACH_SAFE(bp
, &cpu
->breakpoints
, entry
, next
) {
758 if (bp
->flags
& mask
) {
759 cpu_breakpoint_remove_by_ref(cpu
, bp
);
764 /* enable or disable single step mode. EXCP_DEBUG is returned by the
765 CPU loop after each instruction */
766 void cpu_single_step(CPUState
*cpu
, int enabled
)
768 if (cpu
->singlestep_enabled
!= enabled
) {
769 cpu
->singlestep_enabled
= enabled
;
771 kvm_update_guest_debug(cpu
, 0);
773 /* must flush all the translated code to avoid inconsistencies */
774 /* XXX: only flush what is necessary */
775 CPUArchState
*env
= cpu
->env_ptr
;
781 void cpu_abort(CPUState
*cpu
, const char *fmt
, ...)
788 fprintf(stderr
, "qemu: fatal: ");
789 vfprintf(stderr
, fmt
, ap
);
790 fprintf(stderr
, "\n");
791 cpu_dump_state(cpu
, stderr
, fprintf
, CPU_DUMP_FPU
| CPU_DUMP_CCOP
);
792 if (qemu_log_enabled()) {
793 qemu_log("qemu: fatal: ");
794 qemu_log_vprintf(fmt
, ap2
);
796 log_cpu_state(cpu
, CPU_DUMP_FPU
| CPU_DUMP_CCOP
);
802 #if defined(CONFIG_USER_ONLY)
804 struct sigaction act
;
805 sigfillset(&act
.sa_mask
);
806 act
.sa_handler
= SIG_DFL
;
807 sigaction(SIGABRT
, &act
, NULL
);
813 #if !defined(CONFIG_USER_ONLY)
814 /* Called from RCU critical section */
815 static RAMBlock
*qemu_get_ram_block(ram_addr_t addr
)
819 block
= atomic_rcu_read(&ram_list
.mru_block
);
820 if (block
&& addr
- block
->offset
< block
->max_length
) {
823 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
824 if (addr
- block
->offset
< block
->max_length
) {
829 fprintf(stderr
, "Bad ram offset %" PRIx64
"\n", (uint64_t)addr
);
833 /* It is safe to write mru_block outside the iothread lock. This
838 * xxx removed from list
842 * call_rcu(reclaim_ramblock, xxx);
845 * atomic_rcu_set is not needed here. The block was already published
846 * when it was placed into the list. Here we're just making an extra
847 * copy of the pointer.
849 ram_list
.mru_block
= block
;
853 static void tlb_reset_dirty_range_all(ram_addr_t start
, ram_addr_t length
)
859 end
= TARGET_PAGE_ALIGN(start
+ length
);
860 start
&= TARGET_PAGE_MASK
;
863 block
= qemu_get_ram_block(start
);
864 assert(block
== qemu_get_ram_block(end
- 1));
865 start1
= (uintptr_t)ramblock_ptr(block
, start
- block
->offset
);
866 cpu_tlb_reset_dirty_all(start1
, length
);
870 /* Note: start and end must be within the same ram block. */
871 void cpu_physical_memory_reset_dirty(ram_addr_t start
, ram_addr_t length
,
876 cpu_physical_memory_clear_dirty_range_type(start
, length
, client
);
879 tlb_reset_dirty_range_all(start
, length
);
883 static void cpu_physical_memory_set_dirty_tracking(bool enable
)
885 in_migration
= enable
;
888 /* Called from RCU critical section */
889 hwaddr
memory_region_section_get_iotlb(CPUState
*cpu
,
890 MemoryRegionSection
*section
,
892 hwaddr paddr
, hwaddr xlat
,
894 target_ulong
*address
)
899 if (memory_region_is_ram(section
->mr
)) {
901 iotlb
= (memory_region_get_ram_addr(section
->mr
) & TARGET_PAGE_MASK
)
903 if (!section
->readonly
) {
904 iotlb
|= PHYS_SECTION_NOTDIRTY
;
906 iotlb
|= PHYS_SECTION_ROM
;
909 iotlb
= section
- section
->address_space
->dispatch
->map
.sections
;
913 /* Make accesses to pages with watchpoints go via the
914 watchpoint trap routines. */
915 QTAILQ_FOREACH(wp
, &cpu
->watchpoints
, entry
) {
916 if (cpu_watchpoint_address_matches(wp
, vaddr
, TARGET_PAGE_SIZE
)) {
917 /* Avoid trapping reads of pages with a write breakpoint. */
918 if ((prot
& PAGE_WRITE
) || (wp
->flags
& BP_MEM_READ
)) {
919 iotlb
= PHYS_SECTION_WATCH
+ paddr
;
920 *address
|= TLB_MMIO
;
928 #endif /* defined(CONFIG_USER_ONLY) */
930 #if !defined(CONFIG_USER_ONLY)
932 static int subpage_register (subpage_t
*mmio
, uint32_t start
, uint32_t end
,
934 static subpage_t
*subpage_init(AddressSpace
*as
, hwaddr base
);
936 static void *(*phys_mem_alloc
)(size_t size
, uint64_t *align
) =
940 * Set a custom physical guest memory alloator.
941 * Accelerators with unusual needs may need this. Hopefully, we can
942 * get rid of it eventually.
944 void phys_mem_set_alloc(void *(*alloc
)(size_t, uint64_t *align
))
946 phys_mem_alloc
= alloc
;
949 static uint16_t phys_section_add(PhysPageMap
*map
,
950 MemoryRegionSection
*section
)
952 /* The physical section number is ORed with a page-aligned
953 * pointer to produce the iotlb entries. Thus it should
954 * never overflow into the page-aligned value.
956 assert(map
->sections_nb
< TARGET_PAGE_SIZE
);
958 if (map
->sections_nb
== map
->sections_nb_alloc
) {
959 map
->sections_nb_alloc
= MAX(map
->sections_nb_alloc
* 2, 16);
960 map
->sections
= g_renew(MemoryRegionSection
, map
->sections
,
961 map
->sections_nb_alloc
);
963 map
->sections
[map
->sections_nb
] = *section
;
964 memory_region_ref(section
->mr
);
965 return map
->sections_nb
++;
968 static void phys_section_destroy(MemoryRegion
*mr
)
970 memory_region_unref(mr
);
973 subpage_t
*subpage
= container_of(mr
, subpage_t
, iomem
);
974 object_unref(OBJECT(&subpage
->iomem
));
979 static void phys_sections_free(PhysPageMap
*map
)
981 while (map
->sections_nb
> 0) {
982 MemoryRegionSection
*section
= &map
->sections
[--map
->sections_nb
];
983 phys_section_destroy(section
->mr
);
985 g_free(map
->sections
);
989 static void register_subpage(AddressSpaceDispatch
*d
, MemoryRegionSection
*section
)
992 hwaddr base
= section
->offset_within_address_space
994 MemoryRegionSection
*existing
= phys_page_find(d
->phys_map
, base
,
995 d
->map
.nodes
, d
->map
.sections
);
996 MemoryRegionSection subsection
= {
997 .offset_within_address_space
= base
,
998 .size
= int128_make64(TARGET_PAGE_SIZE
),
1002 assert(existing
->mr
->subpage
|| existing
->mr
== &io_mem_unassigned
);
1004 if (!(existing
->mr
->subpage
)) {
1005 subpage
= subpage_init(d
->as
, base
);
1006 subsection
.address_space
= d
->as
;
1007 subsection
.mr
= &subpage
->iomem
;
1008 phys_page_set(d
, base
>> TARGET_PAGE_BITS
, 1,
1009 phys_section_add(&d
->map
, &subsection
));
1011 subpage
= container_of(existing
->mr
, subpage_t
, iomem
);
1013 start
= section
->offset_within_address_space
& ~TARGET_PAGE_MASK
;
1014 end
= start
+ int128_get64(section
->size
) - 1;
1015 subpage_register(subpage
, start
, end
,
1016 phys_section_add(&d
->map
, section
));
1020 static void register_multipage(AddressSpaceDispatch
*d
,
1021 MemoryRegionSection
*section
)
1023 hwaddr start_addr
= section
->offset_within_address_space
;
1024 uint16_t section_index
= phys_section_add(&d
->map
, section
);
1025 uint64_t num_pages
= int128_get64(int128_rshift(section
->size
,
1029 phys_page_set(d
, start_addr
>> TARGET_PAGE_BITS
, num_pages
, section_index
);
1032 static void mem_add(MemoryListener
*listener
, MemoryRegionSection
*section
)
1034 AddressSpace
*as
= container_of(listener
, AddressSpace
, dispatch_listener
);
1035 AddressSpaceDispatch
*d
= as
->next_dispatch
;
1036 MemoryRegionSection now
= *section
, remain
= *section
;
1037 Int128 page_size
= int128_make64(TARGET_PAGE_SIZE
);
1039 if (now
.offset_within_address_space
& ~TARGET_PAGE_MASK
) {
1040 uint64_t left
= TARGET_PAGE_ALIGN(now
.offset_within_address_space
)
1041 - now
.offset_within_address_space
;
1043 now
.size
= int128_min(int128_make64(left
), now
.size
);
1044 register_subpage(d
, &now
);
1046 now
.size
= int128_zero();
1048 while (int128_ne(remain
.size
, now
.size
)) {
1049 remain
.size
= int128_sub(remain
.size
, now
.size
);
1050 remain
.offset_within_address_space
+= int128_get64(now
.size
);
1051 remain
.offset_within_region
+= int128_get64(now
.size
);
1053 if (int128_lt(remain
.size
, page_size
)) {
1054 register_subpage(d
, &now
);
1055 } else if (remain
.offset_within_address_space
& ~TARGET_PAGE_MASK
) {
1056 now
.size
= page_size
;
1057 register_subpage(d
, &now
);
1059 now
.size
= int128_and(now
.size
, int128_neg(page_size
));
1060 register_multipage(d
, &now
);
1065 void qemu_flush_coalesced_mmio_buffer(void)
1068 kvm_flush_coalesced_mmio_buffer();
1071 void qemu_mutex_lock_ramlist(void)
1073 qemu_mutex_lock(&ram_list
.mutex
);
1076 void qemu_mutex_unlock_ramlist(void)
1078 qemu_mutex_unlock(&ram_list
.mutex
);
1083 #include <sys/vfs.h>
1085 #define HUGETLBFS_MAGIC 0x958458f6
1087 static long gethugepagesize(const char *path
, Error
**errp
)
1093 ret
= statfs(path
, &fs
);
1094 } while (ret
!= 0 && errno
== EINTR
);
1097 error_setg_errno(errp
, errno
, "failed to get page size of file %s",
1102 if (fs
.f_type
!= HUGETLBFS_MAGIC
)
1103 fprintf(stderr
, "Warning: path not on HugeTLBFS: %s\n", path
);
1108 static void *file_ram_alloc(RAMBlock
*block
,
1114 char *sanitized_name
;
1119 Error
*local_err
= NULL
;
1121 hpagesize
= gethugepagesize(path
, &local_err
);
1123 error_propagate(errp
, local_err
);
1126 block
->mr
->align
= hpagesize
;
1128 if (memory
< hpagesize
) {
1129 error_setg(errp
, "memory size 0x" RAM_ADDR_FMT
" must be equal to "
1130 "or larger than huge page size 0x%" PRIx64
,
1135 if (kvm_enabled() && !kvm_has_sync_mmu()) {
1137 "host lacks kvm mmu notifiers, -mem-path unsupported");
1141 /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1142 sanitized_name
= g_strdup(memory_region_name(block
->mr
));
1143 for (c
= sanitized_name
; *c
!= '\0'; c
++) {
1148 filename
= g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path
,
1150 g_free(sanitized_name
);
1152 fd
= mkstemp(filename
);
1154 error_setg_errno(errp
, errno
,
1155 "unable to create backing store for hugepages");
1162 memory
= (memory
+hpagesize
-1) & ~(hpagesize
-1);
1165 * ftruncate is not supported by hugetlbfs in older
1166 * hosts, so don't bother bailing out on errors.
1167 * If anything goes wrong with it under other filesystems,
1170 if (ftruncate(fd
, memory
)) {
1171 perror("ftruncate");
1174 area
= mmap(0, memory
, PROT_READ
| PROT_WRITE
,
1175 (block
->flags
& RAM_SHARED
? MAP_SHARED
: MAP_PRIVATE
),
1177 if (area
== MAP_FAILED
) {
1178 error_setg_errno(errp
, errno
,
1179 "unable to map backing store for hugepages");
1185 os_mem_prealloc(fd
, area
, memory
);
1193 error_report("%s", error_get_pretty(*errp
));
1200 /* Called with the ramlist lock held. */
1201 static ram_addr_t
find_ram_offset(ram_addr_t size
)
1203 RAMBlock
*block
, *next_block
;
1204 ram_addr_t offset
= RAM_ADDR_MAX
, mingap
= RAM_ADDR_MAX
;
1206 assert(size
!= 0); /* it would hand out same offset multiple times */
1208 if (QLIST_EMPTY_RCU(&ram_list
.blocks
)) {
1212 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1213 ram_addr_t end
, next
= RAM_ADDR_MAX
;
1215 end
= block
->offset
+ block
->max_length
;
1217 QLIST_FOREACH_RCU(next_block
, &ram_list
.blocks
, next
) {
1218 if (next_block
->offset
>= end
) {
1219 next
= MIN(next
, next_block
->offset
);
1222 if (next
- end
>= size
&& next
- end
< mingap
) {
1224 mingap
= next
- end
;
1228 if (offset
== RAM_ADDR_MAX
) {
1229 fprintf(stderr
, "Failed to find gap of requested size: %" PRIu64
"\n",
1237 ram_addr_t
last_ram_offset(void)
1240 ram_addr_t last
= 0;
1243 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1244 last
= MAX(last
, block
->offset
+ block
->max_length
);
1250 static void qemu_ram_setup_dump(void *addr
, ram_addr_t size
)
1254 /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1255 if (!machine_dump_guest_core(current_machine
)) {
1256 ret
= qemu_madvise(addr
, size
, QEMU_MADV_DONTDUMP
);
1258 perror("qemu_madvise");
1259 fprintf(stderr
, "madvise doesn't support MADV_DONTDUMP, "
1260 "but dump_guest_core=off specified\n");
1265 /* Called within an RCU critical section, or while the ramlist lock
1268 static RAMBlock
*find_ram_block(ram_addr_t addr
)
1272 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1273 if (block
->offset
== addr
) {
1281 /* Called with iothread lock held. */
1282 void qemu_ram_set_idstr(ram_addr_t addr
, const char *name
, DeviceState
*dev
)
1284 RAMBlock
*new_block
, *block
;
1287 new_block
= find_ram_block(addr
);
1289 assert(!new_block
->idstr
[0]);
1292 char *id
= qdev_get_dev_path(dev
);
1294 snprintf(new_block
->idstr
, sizeof(new_block
->idstr
), "%s/", id
);
1298 pstrcat(new_block
->idstr
, sizeof(new_block
->idstr
), name
);
1300 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1301 if (block
!= new_block
&& !strcmp(block
->idstr
, new_block
->idstr
)) {
1302 fprintf(stderr
, "RAMBlock \"%s\" already registered, abort!\n",
1310 /* Called with iothread lock held. */
1311 void qemu_ram_unset_idstr(ram_addr_t addr
)
1315 /* FIXME: arch_init.c assumes that this is not called throughout
1316 * migration. Ignore the problem since hot-unplug during migration
1317 * does not work anyway.
1321 block
= find_ram_block(addr
);
1323 memset(block
->idstr
, 0, sizeof(block
->idstr
));
1328 static int memory_try_enable_merging(void *addr
, size_t len
)
1330 if (!machine_mem_merge(current_machine
)) {
1331 /* disabled by the user */
1335 return qemu_madvise(addr
, len
, QEMU_MADV_MERGEABLE
);
1338 /* Only legal before guest might have detected the memory size: e.g. on
1339 * incoming migration, or right after reset.
1341 * As memory core doesn't know how is memory accessed, it is up to
1342 * resize callback to update device state and/or add assertions to detect
1343 * misuse, if necessary.
1345 int qemu_ram_resize(ram_addr_t base
, ram_addr_t newsize
, Error
**errp
)
1347 RAMBlock
*block
= find_ram_block(base
);
1351 newsize
= TARGET_PAGE_ALIGN(newsize
);
1353 if (block
->used_length
== newsize
) {
1357 if (!(block
->flags
& RAM_RESIZEABLE
)) {
1358 error_setg_errno(errp
, EINVAL
,
1359 "Length mismatch: %s: 0x" RAM_ADDR_FMT
1360 " in != 0x" RAM_ADDR_FMT
, block
->idstr
,
1361 newsize
, block
->used_length
);
1365 if (block
->max_length
< newsize
) {
1366 error_setg_errno(errp
, EINVAL
,
1367 "Length too large: %s: 0x" RAM_ADDR_FMT
1368 " > 0x" RAM_ADDR_FMT
, block
->idstr
,
1369 newsize
, block
->max_length
);
1373 cpu_physical_memory_clear_dirty_range(block
->offset
, block
->used_length
);
1374 block
->used_length
= newsize
;
1375 cpu_physical_memory_set_dirty_range(block
->offset
, block
->used_length
);
1376 memory_region_set_size(block
->mr
, newsize
);
1377 if (block
->resized
) {
1378 block
->resized(block
->idstr
, newsize
, block
->host
);
1383 static ram_addr_t
ram_block_add(RAMBlock
*new_block
, Error
**errp
)
1386 RAMBlock
*last_block
= NULL
;
1387 ram_addr_t old_ram_size
, new_ram_size
;
1389 old_ram_size
= last_ram_offset() >> TARGET_PAGE_BITS
;
1391 qemu_mutex_lock_ramlist();
1392 new_block
->offset
= find_ram_offset(new_block
->max_length
);
1394 if (!new_block
->host
) {
1395 if (xen_enabled()) {
1396 xen_ram_alloc(new_block
->offset
, new_block
->max_length
,
1399 new_block
->host
= phys_mem_alloc(new_block
->max_length
,
1400 &new_block
->mr
->align
);
1401 if (!new_block
->host
) {
1402 error_setg_errno(errp
, errno
,
1403 "cannot set up guest memory '%s'",
1404 memory_region_name(new_block
->mr
));
1405 qemu_mutex_unlock_ramlist();
1408 memory_try_enable_merging(new_block
->host
, new_block
->max_length
);
1412 /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ,
1413 * QLIST (which has an RCU-friendly variant) does not have insertion at
1414 * tail, so save the last element in last_block.
1416 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1418 if (block
->max_length
< new_block
->max_length
) {
1423 QLIST_INSERT_BEFORE_RCU(block
, new_block
, next
);
1424 } else if (last_block
) {
1425 QLIST_INSERT_AFTER_RCU(last_block
, new_block
, next
);
1426 } else { /* list is empty */
1427 QLIST_INSERT_HEAD_RCU(&ram_list
.blocks
, new_block
, next
);
1429 ram_list
.mru_block
= NULL
;
1431 /* Write list before version */
1434 qemu_mutex_unlock_ramlist();
1436 new_ram_size
= last_ram_offset() >> TARGET_PAGE_BITS
;
1438 if (new_ram_size
> old_ram_size
) {
1441 /* ram_list.dirty_memory[] is protected by the iothread lock. */
1442 for (i
= 0; i
< DIRTY_MEMORY_NUM
; i
++) {
1443 ram_list
.dirty_memory
[i
] =
1444 bitmap_zero_extend(ram_list
.dirty_memory
[i
],
1445 old_ram_size
, new_ram_size
);
1448 cpu_physical_memory_set_dirty_range(new_block
->offset
,
1449 new_block
->used_length
);
1451 if (new_block
->host
) {
1452 qemu_ram_setup_dump(new_block
->host
, new_block
->max_length
);
1453 qemu_madvise(new_block
->host
, new_block
->max_length
, QEMU_MADV_HUGEPAGE
);
1454 qemu_madvise(new_block
->host
, new_block
->max_length
, QEMU_MADV_DONTFORK
);
1455 if (kvm_enabled()) {
1456 kvm_setup_guest_memory(new_block
->host
, new_block
->max_length
);
1460 return new_block
->offset
;
1464 ram_addr_t
qemu_ram_alloc_from_file(ram_addr_t size
, MemoryRegion
*mr
,
1465 bool share
, const char *mem_path
,
1468 RAMBlock
*new_block
;
1470 Error
*local_err
= NULL
;
1472 if (xen_enabled()) {
1473 error_setg(errp
, "-mem-path not supported with Xen");
1477 if (phys_mem_alloc
!= qemu_anon_ram_alloc
) {
1479 * file_ram_alloc() needs to allocate just like
1480 * phys_mem_alloc, but we haven't bothered to provide
1484 "-mem-path not supported with this accelerator");
1488 size
= TARGET_PAGE_ALIGN(size
);
1489 new_block
= g_malloc0(sizeof(*new_block
));
1491 new_block
->used_length
= size
;
1492 new_block
->max_length
= size
;
1493 new_block
->flags
= share
? RAM_SHARED
: 0;
1494 new_block
->host
= file_ram_alloc(new_block
, size
,
1496 if (!new_block
->host
) {
1501 addr
= ram_block_add(new_block
, &local_err
);
1504 error_propagate(errp
, local_err
);
1512 ram_addr_t
qemu_ram_alloc_internal(ram_addr_t size
, ram_addr_t max_size
,
1513 void (*resized
)(const char*,
1516 void *host
, bool resizeable
,
1517 MemoryRegion
*mr
, Error
**errp
)
1519 RAMBlock
*new_block
;
1521 Error
*local_err
= NULL
;
1523 size
= TARGET_PAGE_ALIGN(size
);
1524 max_size
= TARGET_PAGE_ALIGN(max_size
);
1525 new_block
= g_malloc0(sizeof(*new_block
));
1527 new_block
->resized
= resized
;
1528 new_block
->used_length
= size
;
1529 new_block
->max_length
= max_size
;
1530 assert(max_size
>= size
);
1532 new_block
->host
= host
;
1534 new_block
->flags
|= RAM_PREALLOC
;
1537 new_block
->flags
|= RAM_RESIZEABLE
;
1539 addr
= ram_block_add(new_block
, &local_err
);
1542 error_propagate(errp
, local_err
);
1548 ram_addr_t
qemu_ram_alloc_from_ptr(ram_addr_t size
, void *host
,
1549 MemoryRegion
*mr
, Error
**errp
)
1551 return qemu_ram_alloc_internal(size
, size
, NULL
, host
, false, mr
, errp
);
1554 ram_addr_t
qemu_ram_alloc(ram_addr_t size
, MemoryRegion
*mr
, Error
**errp
)
1556 return qemu_ram_alloc_internal(size
, size
, NULL
, NULL
, false, mr
, errp
);
1559 ram_addr_t
qemu_ram_alloc_resizeable(ram_addr_t size
, ram_addr_t maxsz
,
1560 void (*resized
)(const char*,
1563 MemoryRegion
*mr
, Error
**errp
)
1565 return qemu_ram_alloc_internal(size
, maxsz
, resized
, NULL
, true, mr
, errp
);
1568 void qemu_ram_free_from_ptr(ram_addr_t addr
)
1572 qemu_mutex_lock_ramlist();
1573 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1574 if (addr
== block
->offset
) {
1575 QLIST_REMOVE_RCU(block
, next
);
1576 ram_list
.mru_block
= NULL
;
1577 /* Write list before version */
1580 g_free_rcu(block
, rcu
);
1584 qemu_mutex_unlock_ramlist();
1587 static void reclaim_ramblock(RAMBlock
*block
)
1589 if (block
->flags
& RAM_PREALLOC
) {
1591 } else if (xen_enabled()) {
1592 xen_invalidate_map_cache_entry(block
->host
);
1594 } else if (block
->fd
>= 0) {
1595 munmap(block
->host
, block
->max_length
);
1599 qemu_anon_ram_free(block
->host
, block
->max_length
);
1604 void qemu_ram_free(ram_addr_t addr
)
1608 qemu_mutex_lock_ramlist();
1609 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1610 if (addr
== block
->offset
) {
1611 QLIST_REMOVE_RCU(block
, next
);
1612 ram_list
.mru_block
= NULL
;
1613 /* Write list before version */
1616 call_rcu(block
, reclaim_ramblock
, rcu
);
1620 qemu_mutex_unlock_ramlist();
1624 void qemu_ram_remap(ram_addr_t addr
, ram_addr_t length
)
1631 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1632 offset
= addr
- block
->offset
;
1633 if (offset
< block
->max_length
) {
1634 vaddr
= ramblock_ptr(block
, offset
);
1635 if (block
->flags
& RAM_PREALLOC
) {
1637 } else if (xen_enabled()) {
1641 munmap(vaddr
, length
);
1642 if (block
->fd
>= 0) {
1643 flags
|= (block
->flags
& RAM_SHARED
?
1644 MAP_SHARED
: MAP_PRIVATE
);
1645 area
= mmap(vaddr
, length
, PROT_READ
| PROT_WRITE
,
1646 flags
, block
->fd
, offset
);
1649 * Remap needs to match alloc. Accelerators that
1650 * set phys_mem_alloc never remap. If they did,
1651 * we'd need a remap hook here.
1653 assert(phys_mem_alloc
== qemu_anon_ram_alloc
);
1655 flags
|= MAP_PRIVATE
| MAP_ANONYMOUS
;
1656 area
= mmap(vaddr
, length
, PROT_READ
| PROT_WRITE
,
1659 if (area
!= vaddr
) {
1660 fprintf(stderr
, "Could not remap addr: "
1661 RAM_ADDR_FMT
"@" RAM_ADDR_FMT
"\n",
1665 memory_try_enable_merging(vaddr
, length
);
1666 qemu_ram_setup_dump(vaddr
, length
);
1671 #endif /* !_WIN32 */
1673 int qemu_get_ram_fd(ram_addr_t addr
)
1679 block
= qemu_get_ram_block(addr
);
1685 void *qemu_get_ram_block_host_ptr(ram_addr_t addr
)
1691 block
= qemu_get_ram_block(addr
);
1692 ptr
= ramblock_ptr(block
, 0);
1697 /* Return a host pointer to ram allocated with qemu_ram_alloc.
1698 * This should not be used for general purpose DMA. Use address_space_map
1699 * or address_space_rw instead. For local memory (e.g. video ram) that the
1700 * device owns, use memory_region_get_ram_ptr.
1702 * By the time this function returns, the returned pointer is not protected
1703 * by RCU anymore. If the caller is not within an RCU critical section and
1704 * does not hold the iothread lock, it must have other means of protecting the
1705 * pointer, such as a reference to the region that includes the incoming
1708 void *qemu_get_ram_ptr(ram_addr_t addr
)
1714 block
= qemu_get_ram_block(addr
);
1716 if (xen_enabled() && block
->host
== NULL
) {
1717 /* We need to check if the requested address is in the RAM
1718 * because we don't want to map the entire memory in QEMU.
1719 * In that case just map until the end of the page.
1721 if (block
->offset
== 0) {
1722 ptr
= xen_map_cache(addr
, 0, 0);
1726 block
->host
= xen_map_cache(block
->offset
, block
->max_length
, 1);
1728 ptr
= ramblock_ptr(block
, addr
- block
->offset
);
1735 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
1736 * but takes a size argument.
1738 * By the time this function returns, the returned pointer is not protected
1739 * by RCU anymore. If the caller is not within an RCU critical section and
1740 * does not hold the iothread lock, it must have other means of protecting the
1741 * pointer, such as a reference to the region that includes the incoming
1744 static void *qemu_ram_ptr_length(ram_addr_t addr
, hwaddr
*size
)
1750 if (xen_enabled()) {
1751 return xen_map_cache(addr
, *size
, 1);
1755 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1756 if (addr
- block
->offset
< block
->max_length
) {
1757 if (addr
- block
->offset
+ *size
> block
->max_length
)
1758 *size
= block
->max_length
- addr
+ block
->offset
;
1759 ptr
= ramblock_ptr(block
, addr
- block
->offset
);
1765 fprintf(stderr
, "Bad ram offset %" PRIx64
"\n", (uint64_t)addr
);
1770 /* Some of the softmmu routines need to translate from a host pointer
1771 * (typically a TLB entry) back to a ram offset.
1773 * By the time this function returns, the returned pointer is not protected
1774 * by RCU anymore. If the caller is not within an RCU critical section and
1775 * does not hold the iothread lock, it must have other means of protecting the
1776 * pointer, such as a reference to the region that includes the incoming
1779 MemoryRegion
*qemu_ram_addr_from_host(void *ptr
, ram_addr_t
*ram_addr
)
1782 uint8_t *host
= ptr
;
1785 if (xen_enabled()) {
1787 *ram_addr
= xen_ram_addr_from_mapcache(ptr
);
1788 mr
= qemu_get_ram_block(*ram_addr
)->mr
;
1794 block
= atomic_rcu_read(&ram_list
.mru_block
);
1795 if (block
&& block
->host
&& host
- block
->host
< block
->max_length
) {
1799 QLIST_FOREACH_RCU(block
, &ram_list
.blocks
, next
) {
1800 /* This case append when the block is not mapped. */
1801 if (block
->host
== NULL
) {
1804 if (host
- block
->host
< block
->max_length
) {
1813 *ram_addr
= block
->offset
+ (host
- block
->host
);
1819 static void notdirty_mem_write(void *opaque
, hwaddr ram_addr
,
1820 uint64_t val
, unsigned size
)
1822 if (!cpu_physical_memory_get_dirty_flag(ram_addr
, DIRTY_MEMORY_CODE
)) {
1823 tb_invalidate_phys_page_fast(ram_addr
, size
);
1827 stb_p(qemu_get_ram_ptr(ram_addr
), val
);
1830 stw_p(qemu_get_ram_ptr(ram_addr
), val
);
1833 stl_p(qemu_get_ram_ptr(ram_addr
), val
);
1838 cpu_physical_memory_set_dirty_range_nocode(ram_addr
, size
);
1839 /* we remove the notdirty callback only if the code has been
1841 if (!cpu_physical_memory_is_clean(ram_addr
)) {
1842 CPUArchState
*env
= current_cpu
->env_ptr
;
1843 tlb_set_dirty(env
, current_cpu
->mem_io_vaddr
);
1847 static bool notdirty_mem_accepts(void *opaque
, hwaddr addr
,
1848 unsigned size
, bool is_write
)
1853 static const MemoryRegionOps notdirty_mem_ops
= {
1854 .write
= notdirty_mem_write
,
1855 .valid
.accepts
= notdirty_mem_accepts
,
1856 .endianness
= DEVICE_NATIVE_ENDIAN
,
1859 /* Generate a debug exception if a watchpoint has been hit. */
1860 static void check_watchpoint(int offset
, int len
, int flags
)
1862 CPUState
*cpu
= current_cpu
;
1863 CPUArchState
*env
= cpu
->env_ptr
;
1864 target_ulong pc
, cs_base
;
1869 if (cpu
->watchpoint_hit
) {
1870 /* We re-entered the check after replacing the TB. Now raise
1871 * the debug interrupt so that is will trigger after the
1872 * current instruction. */
1873 cpu_interrupt(cpu
, CPU_INTERRUPT_DEBUG
);
1876 vaddr
= (cpu
->mem_io_vaddr
& TARGET_PAGE_MASK
) + offset
;
1877 QTAILQ_FOREACH(wp
, &cpu
->watchpoints
, entry
) {
1878 if (cpu_watchpoint_address_matches(wp
, vaddr
, len
)
1879 && (wp
->flags
& flags
)) {
1880 if (flags
== BP_MEM_READ
) {
1881 wp
->flags
|= BP_WATCHPOINT_HIT_READ
;
1883 wp
->flags
|= BP_WATCHPOINT_HIT_WRITE
;
1885 wp
->hitaddr
= vaddr
;
1886 if (!cpu
->watchpoint_hit
) {
1887 cpu
->watchpoint_hit
= wp
;
1888 tb_check_watchpoint(cpu
);
1889 if (wp
->flags
& BP_STOP_BEFORE_ACCESS
) {
1890 cpu
->exception_index
= EXCP_DEBUG
;
1893 cpu_get_tb_cpu_state(env
, &pc
, &cs_base
, &cpu_flags
);
1894 tb_gen_code(cpu
, pc
, cs_base
, cpu_flags
, 1);
1895 cpu_resume_from_signal(cpu
, NULL
);
1899 wp
->flags
&= ~BP_WATCHPOINT_HIT
;
1904 /* Watchpoint access routines. Watchpoints are inserted using TLB tricks,
1905 so these check for a hit then pass through to the normal out-of-line
1907 static uint64_t watch_mem_read(void *opaque
, hwaddr addr
,
1910 check_watchpoint(addr
& ~TARGET_PAGE_MASK
, size
, BP_MEM_READ
);
1912 case 1: return ldub_phys(&address_space_memory
, addr
);
1913 case 2: return lduw_phys(&address_space_memory
, addr
);
1914 case 4: return ldl_phys(&address_space_memory
, addr
);
1919 static void watch_mem_write(void *opaque
, hwaddr addr
,
1920 uint64_t val
, unsigned size
)
1922 check_watchpoint(addr
& ~TARGET_PAGE_MASK
, size
, BP_MEM_WRITE
);
1925 stb_phys(&address_space_memory
, addr
, val
);
1928 stw_phys(&address_space_memory
, addr
, val
);
1931 stl_phys(&address_space_memory
, addr
, val
);
1937 static const MemoryRegionOps watch_mem_ops
= {
1938 .read
= watch_mem_read
,
1939 .write
= watch_mem_write
,
1940 .endianness
= DEVICE_NATIVE_ENDIAN
,
1943 static uint64_t subpage_read(void *opaque
, hwaddr addr
,
1946 subpage_t
*subpage
= opaque
;
1949 #if defined(DEBUG_SUBPAGE)
1950 printf("%s: subpage %p len %u addr " TARGET_FMT_plx
"\n", __func__
,
1951 subpage
, len
, addr
);
1953 address_space_read(subpage
->as
, addr
+ subpage
->base
, buf
, len
);
1968 static void subpage_write(void *opaque
, hwaddr addr
,
1969 uint64_t value
, unsigned len
)
1971 subpage_t
*subpage
= opaque
;
1974 #if defined(DEBUG_SUBPAGE)
1975 printf("%s: subpage %p len %u addr " TARGET_FMT_plx
1976 " value %"PRIx64
"\n",
1977 __func__
, subpage
, len
, addr
, value
);
1995 address_space_write(subpage
->as
, addr
+ subpage
->base
, buf
, len
);
1998 static bool subpage_accepts(void *opaque
, hwaddr addr
,
1999 unsigned len
, bool is_write
)
2001 subpage_t
*subpage
= opaque
;
2002 #if defined(DEBUG_SUBPAGE)
2003 printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx
"\n",
2004 __func__
, subpage
, is_write
? 'w' : 'r', len
, addr
);
2007 return address_space_access_valid(subpage
->as
, addr
+ subpage
->base
,
2011 static const MemoryRegionOps subpage_ops
= {
2012 .read
= subpage_read
,
2013 .write
= subpage_write
,
2014 .impl
.min_access_size
= 1,
2015 .impl
.max_access_size
= 8,
2016 .valid
.min_access_size
= 1,
2017 .valid
.max_access_size
= 8,
2018 .valid
.accepts
= subpage_accepts
,
2019 .endianness
= DEVICE_NATIVE_ENDIAN
,
2022 static int subpage_register (subpage_t
*mmio
, uint32_t start
, uint32_t end
,
2027 if (start
>= TARGET_PAGE_SIZE
|| end
>= TARGET_PAGE_SIZE
)
2029 idx
= SUBPAGE_IDX(start
);
2030 eidx
= SUBPAGE_IDX(end
);
2031 #if defined(DEBUG_SUBPAGE)
2032 printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2033 __func__
, mmio
, start
, end
, idx
, eidx
, section
);
2035 for (; idx
<= eidx
; idx
++) {
2036 mmio
->sub_section
[idx
] = section
;
2042 static subpage_t
*subpage_init(AddressSpace
*as
, hwaddr base
)
2046 mmio
= g_malloc0(sizeof(subpage_t
));
2050 memory_region_init_io(&mmio
->iomem
, NULL
, &subpage_ops
, mmio
,
2051 NULL
, TARGET_PAGE_SIZE
);
2052 mmio
->iomem
.subpage
= true;
2053 #if defined(DEBUG_SUBPAGE)
2054 printf("%s: %p base " TARGET_FMT_plx
" len %08x\n", __func__
,
2055 mmio
, base
, TARGET_PAGE_SIZE
);
2057 subpage_register(mmio
, 0, TARGET_PAGE_SIZE
-1, PHYS_SECTION_UNASSIGNED
);
2062 static uint16_t dummy_section(PhysPageMap
*map
, AddressSpace
*as
,
2066 MemoryRegionSection section
= {
2067 .address_space
= as
,
2069 .offset_within_address_space
= 0,
2070 .offset_within_region
= 0,
2071 .size
= int128_2_64(),
2074 return phys_section_add(map
, §ion
);
2077 MemoryRegion
*iotlb_to_region(CPUState
*cpu
, hwaddr index
)
2079 AddressSpaceDispatch
*d
= atomic_rcu_read(&cpu
->memory_dispatch
);
2080 MemoryRegionSection
*sections
= d
->map
.sections
;
2082 return sections
[index
& ~TARGET_PAGE_MASK
].mr
;
2085 static void io_mem_init(void)
2087 memory_region_init_io(&io_mem_rom
, NULL
, &unassigned_mem_ops
, NULL
, NULL
, UINT64_MAX
);
2088 memory_region_init_io(&io_mem_unassigned
, NULL
, &unassigned_mem_ops
, NULL
,
2090 memory_region_init_io(&io_mem_notdirty
, NULL
, ¬dirty_mem_ops
, NULL
,
2092 memory_region_init_io(&io_mem_watch
, NULL
, &watch_mem_ops
, NULL
,
2096 static void mem_begin(MemoryListener
*listener
)
2098 AddressSpace
*as
= container_of(listener
, AddressSpace
, dispatch_listener
);
2099 AddressSpaceDispatch
*d
= g_new0(AddressSpaceDispatch
, 1);
2102 n
= dummy_section(&d
->map
, as
, &io_mem_unassigned
);
2103 assert(n
== PHYS_SECTION_UNASSIGNED
);
2104 n
= dummy_section(&d
->map
, as
, &io_mem_notdirty
);
2105 assert(n
== PHYS_SECTION_NOTDIRTY
);
2106 n
= dummy_section(&d
->map
, as
, &io_mem_rom
);
2107 assert(n
== PHYS_SECTION_ROM
);
2108 n
= dummy_section(&d
->map
, as
, &io_mem_watch
);
2109 assert(n
== PHYS_SECTION_WATCH
);
2111 d
->phys_map
= (PhysPageEntry
) { .ptr
= PHYS_MAP_NODE_NIL
, .skip
= 1 };
2113 as
->next_dispatch
= d
;
2116 static void address_space_dispatch_free(AddressSpaceDispatch
*d
)
2118 phys_sections_free(&d
->map
);
2122 static void mem_commit(MemoryListener
*listener
)
2124 AddressSpace
*as
= container_of(listener
, AddressSpace
, dispatch_listener
);
2125 AddressSpaceDispatch
*cur
= as
->dispatch
;
2126 AddressSpaceDispatch
*next
= as
->next_dispatch
;
2128 phys_page_compact_all(next
, next
->map
.nodes_nb
);
2130 atomic_rcu_set(&as
->dispatch
, next
);
2132 call_rcu(cur
, address_space_dispatch_free
, rcu
);
2136 static void tcg_commit(MemoryListener
*listener
)
2140 /* since each CPU stores ram addresses in its TLB cache, we must
2141 reset the modified entries */
2144 /* FIXME: Disentangle the cpu.h circular files deps so we can
2145 directly get the right CPU from listener. */
2146 if (cpu
->tcg_as_listener
!= listener
) {
2149 cpu_reload_memory_map(cpu
);
2153 static void core_log_global_start(MemoryListener
*listener
)
2155 cpu_physical_memory_set_dirty_tracking(true);
2158 static void core_log_global_stop(MemoryListener
*listener
)
2160 cpu_physical_memory_set_dirty_tracking(false);
2163 static MemoryListener core_memory_listener
= {
2164 .log_global_start
= core_log_global_start
,
2165 .log_global_stop
= core_log_global_stop
,
2169 void address_space_init_dispatch(AddressSpace
*as
)
2171 as
->dispatch
= NULL
;
2172 as
->dispatch_listener
= (MemoryListener
) {
2174 .commit
= mem_commit
,
2175 .region_add
= mem_add
,
2176 .region_nop
= mem_add
,
2179 memory_listener_register(&as
->dispatch_listener
, as
);
2182 void address_space_unregister(AddressSpace
*as
)
2184 memory_listener_unregister(&as
->dispatch_listener
);
2187 void address_space_destroy_dispatch(AddressSpace
*as
)
2189 AddressSpaceDispatch
*d
= as
->dispatch
;
2191 atomic_rcu_set(&as
->dispatch
, NULL
);
2193 call_rcu(d
, address_space_dispatch_free
, rcu
);
2197 static void memory_map_init(void)
2199 system_memory
= g_malloc(sizeof(*system_memory
));
2201 memory_region_init(system_memory
, NULL
, "system", UINT64_MAX
);
2202 address_space_init(&address_space_memory
, system_memory
, "memory");
2204 system_io
= g_malloc(sizeof(*system_io
));
2205 memory_region_init_io(system_io
, NULL
, &unassigned_io_ops
, NULL
, "io",
2207 address_space_init(&address_space_io
, system_io
, "I/O");
2209 memory_listener_register(&core_memory_listener
, &address_space_memory
);
2212 MemoryRegion
*get_system_memory(void)
2214 return system_memory
;
2217 MemoryRegion
*get_system_io(void)
2222 #endif /* !defined(CONFIG_USER_ONLY) */
2224 /* physical memory access (slow version, mainly for debug) */
2225 #if defined(CONFIG_USER_ONLY)
2226 int cpu_memory_rw_debug(CPUState
*cpu
, target_ulong addr
,
2227 uint8_t *buf
, int len
, int is_write
)
2234 page
= addr
& TARGET_PAGE_MASK
;
2235 l
= (page
+ TARGET_PAGE_SIZE
) - addr
;
2238 flags
= page_get_flags(page
);
2239 if (!(flags
& PAGE_VALID
))
2242 if (!(flags
& PAGE_WRITE
))
2244 /* XXX: this code should not depend on lock_user */
2245 if (!(p
= lock_user(VERIFY_WRITE
, addr
, l
, 0)))
2248 unlock_user(p
, addr
, l
);
2250 if (!(flags
& PAGE_READ
))
2252 /* XXX: this code should not depend on lock_user */
2253 if (!(p
= lock_user(VERIFY_READ
, addr
, l
, 1)))
2256 unlock_user(p
, addr
, 0);
2267 static void invalidate_and_set_dirty(hwaddr addr
,
2270 if (cpu_physical_memory_range_includes_clean(addr
, length
)) {
2271 tb_invalidate_phys_range(addr
, addr
+ length
, 0);
2272 cpu_physical_memory_set_dirty_range_nocode(addr
, length
);
2274 xen_modified_memory(addr
, length
);
2277 static int memory_access_size(MemoryRegion
*mr
, unsigned l
, hwaddr addr
)
2279 unsigned access_size_max
= mr
->ops
->valid
.max_access_size
;
2281 /* Regions are assumed to support 1-4 byte accesses unless
2282 otherwise specified. */
2283 if (access_size_max
== 0) {
2284 access_size_max
= 4;
2287 /* Bound the maximum access by the alignment of the address. */
2288 if (!mr
->ops
->impl
.unaligned
) {
2289 unsigned align_size_max
= addr
& -addr
;
2290 if (align_size_max
!= 0 && align_size_max
< access_size_max
) {
2291 access_size_max
= align_size_max
;
2295 /* Don't attempt accesses larger than the maximum. */
2296 if (l
> access_size_max
) {
2297 l
= access_size_max
;
2300 l
= 1 << (qemu_fls(l
) - 1);
2306 bool address_space_rw(AddressSpace
*as
, hwaddr addr
, uint8_t *buf
,
2307 int len
, bool is_write
)
2318 mr
= address_space_translate(as
, addr
, &addr1
, &l
, is_write
);
2321 if (!memory_access_is_direct(mr
, is_write
)) {
2322 l
= memory_access_size(mr
, l
, addr1
);
2323 /* XXX: could force current_cpu to NULL to avoid
2327 /* 64 bit write access */
2329 error
|= io_mem_write(mr
, addr1
, val
, 8);
2332 /* 32 bit write access */
2334 error
|= io_mem_write(mr
, addr1
, val
, 4);
2337 /* 16 bit write access */
2339 error
|= io_mem_write(mr
, addr1
, val
, 2);
2342 /* 8 bit write access */
2344 error
|= io_mem_write(mr
, addr1
, val
, 1);
2350 addr1
+= memory_region_get_ram_addr(mr
);
2352 ptr
= qemu_get_ram_ptr(addr1
);
2353 memcpy(ptr
, buf
, l
);
2354 invalidate_and_set_dirty(addr1
, l
);
2357 if (!memory_access_is_direct(mr
, is_write
)) {
2359 l
= memory_access_size(mr
, l
, addr1
);
2362 /* 64 bit read access */
2363 error
|= io_mem_read(mr
, addr1
, &val
, 8);
2367 /* 32 bit read access */
2368 error
|= io_mem_read(mr
, addr1
, &val
, 4);
2372 /* 16 bit read access */
2373 error
|= io_mem_read(mr
, addr1
, &val
, 2);
2377 /* 8 bit read access */
2378 error
|= io_mem_read(mr
, addr1
, &val
, 1);
2386 ptr
= qemu_get_ram_ptr(mr
->ram_addr
+ addr1
);
2387 memcpy(buf
, ptr
, l
);
2398 bool address_space_write(AddressSpace
*as
, hwaddr addr
,
2399 const uint8_t *buf
, int len
)
2401 return address_space_rw(as
, addr
, (uint8_t *)buf
, len
, true);
2404 bool address_space_read(AddressSpace
*as
, hwaddr addr
, uint8_t *buf
, int len
)
2406 return address_space_rw(as
, addr
, buf
, len
, false);
2410 void cpu_physical_memory_rw(hwaddr addr
, uint8_t *buf
,
2411 int len
, int is_write
)
2413 address_space_rw(&address_space_memory
, addr
, buf
, len
, is_write
);
2416 enum write_rom_type
{
2421 static inline void cpu_physical_memory_write_rom_internal(AddressSpace
*as
,
2422 hwaddr addr
, const uint8_t *buf
, int len
, enum write_rom_type type
)
2431 mr
= address_space_translate(as
, addr
, &addr1
, &l
, true);
2433 if (!(memory_region_is_ram(mr
) ||
2434 memory_region_is_romd(mr
))) {
2437 addr1
+= memory_region_get_ram_addr(mr
);
2439 ptr
= qemu_get_ram_ptr(addr1
);
2442 memcpy(ptr
, buf
, l
);
2443 invalidate_and_set_dirty(addr1
, l
);
2446 flush_icache_range((uintptr_t)ptr
, (uintptr_t)ptr
+ l
);
2456 /* used for ROM loading : can write in RAM and ROM */
2457 void cpu_physical_memory_write_rom(AddressSpace
*as
, hwaddr addr
,
2458 const uint8_t *buf
, int len
)
2460 cpu_physical_memory_write_rom_internal(as
, addr
, buf
, len
, WRITE_DATA
);
2463 void cpu_flush_icache_range(hwaddr start
, int len
)
2466 * This function should do the same thing as an icache flush that was
2467 * triggered from within the guest. For TCG we are always cache coherent,
2468 * so there is no need to flush anything. For KVM / Xen we need to flush
2469 * the host's instruction cache at least.
2471 if (tcg_enabled()) {
2475 cpu_physical_memory_write_rom_internal(&address_space_memory
,
2476 start
, NULL
, len
, FLUSH_CACHE
);
2486 static BounceBuffer bounce
;
2488 typedef struct MapClient
{
2490 void (*callback
)(void *opaque
);
2491 QLIST_ENTRY(MapClient
) link
;
2494 static QLIST_HEAD(map_client_list
, MapClient
) map_client_list
2495 = QLIST_HEAD_INITIALIZER(map_client_list
);
2497 void *cpu_register_map_client(void *opaque
, void (*callback
)(void *opaque
))
2499 MapClient
*client
= g_malloc(sizeof(*client
));
2501 client
->opaque
= opaque
;
2502 client
->callback
= callback
;
2503 QLIST_INSERT_HEAD(&map_client_list
, client
, link
);
2507 static void cpu_unregister_map_client(void *_client
)
2509 MapClient
*client
= (MapClient
*)_client
;
2511 QLIST_REMOVE(client
, link
);
2515 static void cpu_notify_map_clients(void)
2519 while (!QLIST_EMPTY(&map_client_list
)) {
2520 client
= QLIST_FIRST(&map_client_list
);
2521 client
->callback(client
->opaque
);
2522 cpu_unregister_map_client(client
);
2526 bool address_space_access_valid(AddressSpace
*as
, hwaddr addr
, int len
, bool is_write
)
2533 mr
= address_space_translate(as
, addr
, &xlat
, &l
, is_write
);
2534 if (!memory_access_is_direct(mr
, is_write
)) {
2535 l
= memory_access_size(mr
, l
, addr
);
2536 if (!memory_region_access_valid(mr
, xlat
, l
, is_write
)) {
/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write)
{
    hwaddr len = *plen;
    hwaddr done = 0;
    hwaddr l, xlat, base;
    MemoryRegion *mr, *this_mr;
    ram_addr_t raddr;

    if (len == 0) {
        return NULL;
    }

    l = len;
    mr = address_space_translate(as, addr, &xlat, &l, is_write);
    if (!memory_access_is_direct(mr, is_write)) {
        if (bounce.buffer) {
            return NULL;
        }
        /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        if (!is_write) {
            address_space_read(as, addr, bounce.buffer, l);
        }

        *plen = l;
        return bounce.buffer;
    }

    base = xlat;
    raddr = memory_region_get_ram_addr(mr);

    for (;;) {
        len -= l;
        addr += l;
        done += l;
        if (len == 0) {
            break;
        }

        l = len;
        this_mr = address_space_translate(as, addr, &xlat, &l, is_write);
        if (this_mr != mr || xlat != base + done) {
            break;
        }
    }

    memory_region_ref(mr);
    *plen = done;
    return qemu_ram_ptr_length(raddr + base, plen);
}
/* Unmaps a memory region previously mapped by address_space_map().
 * Will also mark the memory as dirty if is_write == 1. access_len gives
 * the amount of memory that was actually read or written by the caller.
 */
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         int is_write, hwaddr access_len)
{
    if (buffer != bounce.buffer) {
        MemoryRegion *mr;
        ram_addr_t addr1;

        mr = qemu_ram_addr_from_host(buffer, &addr1);
        assert(mr != NULL);
        if (is_write) {
            invalidate_and_set_dirty(addr1, access_len);
        }
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(buffer);
        }
        memory_region_unref(mr);
        return;
    }
    if (is_write) {
        address_space_write(as, bounce.addr, bounce.buffer, access_len);
    }
    qemu_vfree(bounce.buffer);
    bounce.buffer = NULL;
    memory_region_unref(bounce.mr);
    cpu_notify_map_clients();
}
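/*
 * Illustrative sketch (not part of the original file): a zero-copy DMA read
 * using address_space_map()/address_space_unmap().  The retry callback and
 * opaque device pointer are hypothetical; a real device would restart its
 * transfer from the callback once the bounce buffer becomes free again.
 */
static inline void example_dma_read(hwaddr dma_addr, hwaddr dma_len,
                                    void (*retry_cb)(void *opaque), void *dev)
{
    hwaddr len = dma_len;
    void *host = address_space_map(&address_space_memory, dma_addr, &len,
                                   false);

    if (!host) {
        /* Mapping resources are exhausted; ask to be notified and retry. */
        cpu_register_map_client(dev, retry_cb);
        return;
    }
    /* Note: only 'len' bytes were mapped; this may be less than dma_len. */
    /* ... consume the bytes at 'host' here ... */
    address_space_unmap(&address_space_memory, host, len, false, len);
}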
void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              int is_write)
{
    return address_space_map(&address_space_memory, addr, plen, is_write);
}
void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               int is_write, hwaddr access_len)
{
    address_space_unmap(&address_space_memory, buffer, len,
                        is_write, access_len);
}
/* warning: addr must be aligned */
static inline uint32_t ldl_phys_internal(AddressSpace *as, hwaddr addr,
                                         enum device_endian endian)
{
    uint8_t *ptr;
    uint64_t val;
    MemoryRegion *mr;
    hwaddr l = 4;
    hwaddr addr1;

    mr = address_space_translate(as, addr, &addr1, &l, false);
    if (l < 4 || !memory_access_is_direct(mr, false)) {
        /* I/O case */
        io_mem_read(mr, addr1, &val, 4);
#if defined(TARGET_WORDS_BIGENDIAN)
        if (endian == DEVICE_LITTLE_ENDIAN) {
            val = bswap32(val);
        }
#else
        if (endian == DEVICE_BIG_ENDIAN) {
            val = bswap32(val);
        }
#endif
    } else {
        /* RAM case */
        ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
                                & TARGET_PAGE_MASK)
                               + addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = ldl_le_p(ptr);
            break;
        case DEVICE_BIG_ENDIAN:
            val = ldl_be_p(ptr);
            break;
        default:
            val = ldl_p(ptr);
            break;
        }
    }
    return val;
}
uint32_t ldl_phys(AddressSpace *as, hwaddr addr)
{
    return ldl_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
}

uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr)
{
    return ldl_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
}

uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr)
{
    return ldl_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
}
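/*
 * Illustrative sketch (not part of the original file): device emulation
 * reading a 32-bit little-endian field out of an in-guest structure.  The
 * table address and offset are hypothetical; ldl_le_phys() hides the host
 * and target byte order behind an explicit endianness.
 */
static inline uint32_t example_read_le32_field(hwaddr table_pa, hwaddr offset)
{
    return ldl_le_phys(&address_space_memory, table_pa + offset);
}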
/* warning: addr must be aligned */
static inline uint64_t ldq_phys_internal(AddressSpace *as, hwaddr addr,
                                         enum device_endian endian)
{
    uint8_t *ptr;
    uint64_t val;
    MemoryRegion *mr;
    hwaddr l = 8;
    hwaddr addr1;

    mr = address_space_translate(as, addr, &addr1, &l,
                                 false);
    if (l < 8 || !memory_access_is_direct(mr, false)) {
        /* I/O case */
        io_mem_read(mr, addr1, &val, 8);
#if defined(TARGET_WORDS_BIGENDIAN)
        if (endian == DEVICE_LITTLE_ENDIAN) {
            val = bswap64(val);
        }
#else
        if (endian == DEVICE_BIG_ENDIAN) {
            val = bswap64(val);
        }
#endif
    } else {
        /* RAM case */
        ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
                                & TARGET_PAGE_MASK)
                               + addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = ldq_le_p(ptr);
            break;
        case DEVICE_BIG_ENDIAN:
            val = ldq_be_p(ptr);
            break;
        default:
            val = ldq_p(ptr);
            break;
        }
    }
    return val;
}
uint64_t ldq_phys(AddressSpace *as, hwaddr addr)
{
    return ldq_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
}

uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr)
{
    return ldq_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
}

uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr)
{
    return ldq_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
}
uint32_t ldub_phys(AddressSpace *as, hwaddr addr)
{
    uint8_t val;

    address_space_rw(as, addr, &val, 1, 0);
    return val;
}
/* warning: addr must be aligned */
static inline uint32_t lduw_phys_internal(AddressSpace *as, hwaddr addr,
                                          enum device_endian endian)
{
    uint8_t *ptr;
    uint64_t val;
    MemoryRegion *mr;
    hwaddr l = 2;
    hwaddr addr1;

    mr = address_space_translate(as, addr, &addr1, &l,
                                 false);
    if (l < 2 || !memory_access_is_direct(mr, false)) {
        /* I/O case */
        io_mem_read(mr, addr1, &val, 2);
#if defined(TARGET_WORDS_BIGENDIAN)
        if (endian == DEVICE_LITTLE_ENDIAN) {
            val = bswap16(val);
        }
#else
        if (endian == DEVICE_BIG_ENDIAN) {
            val = bswap16(val);
        }
#endif
    } else {
        /* RAM case */
        ptr = qemu_get_ram_ptr((memory_region_get_ram_addr(mr)
                                & TARGET_PAGE_MASK)
                               + addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            val = lduw_le_p(ptr);
            break;
        case DEVICE_BIG_ENDIAN:
            val = lduw_be_p(ptr);
            break;
        default:
            val = lduw_p(ptr);
            break;
        }
    }
    return val;
}
uint32_t lduw_phys(AddressSpace *as, hwaddr addr)
{
    return lduw_phys_internal(as, addr, DEVICE_NATIVE_ENDIAN);
}

uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr)
{
    return lduw_phys_internal(as, addr, DEVICE_LITTLE_ENDIAN);
}

uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr)
{
    return lduw_phys_internal(as, addr, DEVICE_BIG_ENDIAN);
}
/* warning: addr must be aligned. The RAM page is not marked as dirty
   and the code inside is not invalidated. It is useful if the dirty
   bits are used to track modified PTEs */
void stl_phys_notdirty(AddressSpace *as, hwaddr addr, uint32_t val)
{
    uint8_t *ptr;
    MemoryRegion *mr;
    hwaddr l = 4;
    hwaddr addr1;

    mr = address_space_translate(as, addr, &addr1, &l,
                                 true);
    if (l < 4 || !memory_access_is_direct(mr, true)) {
        io_mem_write(mr, addr1, val, 4);
    } else {
        addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
        ptr = qemu_get_ram_ptr(addr1);
        stl_p(ptr, val);

        if (unlikely(in_migration)) {
            if (cpu_physical_memory_is_clean(addr1)) {
                /* invalidate code */
                tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
                /* set dirty bit */
                cpu_physical_memory_set_dirty_range_nocode(addr1, 4);
            }
        }
    }
}
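/*
 * Illustrative sketch (not part of the original file): target MMU emulation
 * setting an "accessed" flag in a guest page-table entry.  Writing through
 * stl_phys_notdirty() keeps QEMU's own bookkeeping write from being mistaken
 * for a guest modification of the page table.  The PTE layout and bit value
 * are hypothetical.
 */
static inline void example_set_pte_accessed(AddressSpace *as, hwaddr pte_pa)
{
    uint32_t pte = ldl_phys(as, pte_pa);

    stl_phys_notdirty(as, pte_pa, pte | 0x20 /* hypothetical ACCESSED bit */);
}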
/* warning: addr must be aligned */
static inline void stl_phys_internal(AddressSpace *as,
                                     hwaddr addr, uint32_t val,
                                     enum device_endian endian)
{
    uint8_t *ptr;
    MemoryRegion *mr;
    hwaddr l = 4;
    hwaddr addr1;

    mr = address_space_translate(as, addr, &addr1, &l,
                                 true);
    if (l < 4 || !memory_access_is_direct(mr, true)) {
#if defined(TARGET_WORDS_BIGENDIAN)
        if (endian == DEVICE_LITTLE_ENDIAN) {
            val = bswap32(val);
        }
#else
        if (endian == DEVICE_BIG_ENDIAN) {
            val = bswap32(val);
        }
#endif
        io_mem_write(mr, addr1, val, 4);
    } else {
        /* RAM case */
        addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
        ptr = qemu_get_ram_ptr(addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            stl_le_p(ptr, val);
            break;
        case DEVICE_BIG_ENDIAN:
            stl_be_p(ptr, val);
            break;
        default:
            stl_p(ptr, val);
            break;
        }
        invalidate_and_set_dirty(addr1, 4);
    }
}
void stl_phys(AddressSpace *as, hwaddr addr, uint32_t val)
{
    stl_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
}

void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
{
    stl_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
}

void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
{
    stl_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
}
void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val)
{
    uint8_t v = val;

    address_space_rw(as, addr, &v, 1, 1);
}
/* warning: addr must be aligned */
static inline void stw_phys_internal(AddressSpace *as,
                                     hwaddr addr, uint32_t val,
                                     enum device_endian endian)
{
    uint8_t *ptr;
    MemoryRegion *mr;
    hwaddr l = 2;
    hwaddr addr1;

    mr = address_space_translate(as, addr, &addr1, &l, true);
    if (l < 2 || !memory_access_is_direct(mr, true)) {
#if defined(TARGET_WORDS_BIGENDIAN)
        if (endian == DEVICE_LITTLE_ENDIAN) {
            val = bswap16(val);
        }
#else
        if (endian == DEVICE_BIG_ENDIAN) {
            val = bswap16(val);
        }
#endif
        io_mem_write(mr, addr1, val, 2);
    } else {
        /* RAM case */
        addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK;
        ptr = qemu_get_ram_ptr(addr1);
        switch (endian) {
        case DEVICE_LITTLE_ENDIAN:
            stw_le_p(ptr, val);
            break;
        case DEVICE_BIG_ENDIAN:
            stw_be_p(ptr, val);
            break;
        default:
            stw_p(ptr, val);
            break;
        }
        invalidate_and_set_dirty(addr1, 2);
    }
}
void stw_phys(AddressSpace *as, hwaddr addr, uint32_t val)
{
    stw_phys_internal(as, addr, val, DEVICE_NATIVE_ENDIAN);
}

void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val)
{
    stw_phys_internal(as, addr, val, DEVICE_LITTLE_ENDIAN);
}

void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val)
{
    stw_phys_internal(as, addr, val, DEVICE_BIG_ENDIAN);
}
void stq_phys(AddressSpace *as, hwaddr addr, uint64_t val)
{
    val = tswap64(val);
    address_space_rw(as, addr, (void *) &val, 8, 1);
}
void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val)
{
    val = cpu_to_le64(val);
    address_space_rw(as, addr, (void *) &val, 8, 1);
}
void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val)
{
    val = cpu_to_be64(val);
    address_space_rw(as, addr, (void *) &val, 8, 1);
}
/* virtual memory access for debug (includes writing to ROM) */
int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
                        uint8_t *buf, int len, int is_write)
{
    int l;
    hwaddr phys_addr;
    target_ulong page;

    while (len > 0) {
        page = addr & TARGET_PAGE_MASK;
        phys_addr = cpu_get_phys_page_debug(cpu, page);
        /* if no physical page mapped, return an error */
        if (phys_addr == -1) {
            return -1;
        }
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len) {
            l = len;
        }
        phys_addr += (addr & ~TARGET_PAGE_MASK);
        if (is_write) {
            cpu_physical_memory_write_rom(cpu->as, phys_addr, buf, l);
        } else {
            address_space_rw(cpu->as, phys_addr, buf, l, 0);
        }
        len -= l;
        buf += l;
        addr += l;
    }
    return 0;
}
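/*
 * Illustrative sketch (not part of the original file): the kind of access a
 * debugger stub performs.  It operates on guest *virtual* addresses and,
 * unlike address_space_rw(), may write even to ROM.  The function name and
 * call site are hypothetical.
 */
static inline int example_debugger_read(CPUState *cpu, target_ulong vaddr,
                                        uint8_t *buf, int size)
{
    /* Returns -1 if some page in the range has no physical mapping. */
    return cpu_memory_rw_debug(cpu, vaddr, buf, size, 0);
}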
/*
 * A helper function for the _utterly broken_ virtio device model to find out if
 * it's running on a big endian machine. Don't do this at home kids!
 */
bool target_words_bigendian(void);
bool target_words_bigendian(void)
{
#if defined(TARGET_WORDS_BIGENDIAN)
    return true;
#else
    return false;
#endif
}
#ifndef CONFIG_USER_ONLY
bool cpu_physical_memory_is_io(hwaddr phys_addr)
{
    MemoryRegion *mr;
    hwaddr l = 1;

    mr = address_space_translate(&address_space_memory,
                                 phys_addr, &phys_addr, &l, false);

    return !(memory_region_is_ram(mr) ||
             memory_region_is_romd(mr));
}
void qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        func(block->host, block->offset, block->used_length, opaque);
    }
    rcu_read_unlock();
}