exec.c

   1 /*
   2  *  Virtual page mapping
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "qemu-common.h"
  22 #include "qapi/error.h"
  23
  24 #include "qemu/cutils.h"
  25 #include "cpu.h"
  26 #include "exec/exec-all.h"
  27 #include "exec/target_page.h"
  28 #include "tcg.h"
  29 #include "hw/qdev-core.h"
  30 #include "hw/qdev-properties.h"
  31 #if !defined(CONFIG_USER_ONLY)
  32 #include "hw/boards.h"
  33 #include "hw/xen/xen.h"
  34 #endif
  35 #include "sysemu/kvm.h"
  36 #include "sysemu/sysemu.h"
  37 #include "sysemu/tcg.h"
  38 #include "qemu/timer.h"
  39 #include "qemu/config-file.h"
  40 #include "qemu/error-report.h"
  41 #include "qemu/qemu-print.h"
  42 #if defined(CONFIG_USER_ONLY)
  43 #include "qemu.h"
  44 #else /* !CONFIG_USER_ONLY */
  45 #include "exec/memory.h"
  46 #include "exec/ioport.h"
  47 #include "sysemu/dma.h"
  48 #include "sysemu/hostmem.h"
  49 #include "sysemu/hw_accel.h"
  50 #include "exec/address-spaces.h"
  51 #include "sysemu/xen-mapcache.h"
  52 #include "trace-root.h"
  53
  54 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
  55 #include <linux/falloc.h>
  56 #endif
  57
  58 #endif
  59 #include "qemu/rcu_queue.h"
  60 #include "qemu/main-loop.h"
  61 #include "translate-all.h"
  62 #include "sysemu/replay.h"
  63
  64 #include "exec/memory-internal.h"
  65 #include "exec/ram_addr.h"
  66 #include "exec/log.h"
  67
  68 #include "migration/vmstate.h"
  69
  70 #include "qemu/range.h"
  71 #ifndef _WIN32
  72 #include "qemu/mmap-alloc.h"
  73 #endif
  74
  75 #include "monitor/monitor.h"
  76
  77 //#define DEBUG_SUBPAGE
  78
  79 #if !defined(CONFIG_USER_ONLY)
/* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
 * are protected by the ramlist lock.
 */
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };

/* File-local root regions.  system_memory is what cpu_exec_initfn()
 * installs as the default cpu->memory.
 */
static MemoryRegion *system_memory;
static MemoryRegion *system_io;

AddressSpace address_space_io;
AddressSpace address_space_memory;

/* Region the translation helpers in this file return (and compare
 * against) to signal a failed/unassigned translation.
 */
static MemoryRegion io_mem_unassigned;
  92 #endif
  93
/* Global CPU list head, statically initialised to an empty tail queue. */
CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);

/* current CPU in the current thread. It is only valid inside
   cpu_exec() */
__thread CPUState *current_cpu;
/* 0 = Do not count executed instructions.
   1 = Precise instruction counting.
   2 = Adaptive rate instruction counting.  */
int use_icount;

/* Host page size and mask.  NOTE(review): neither is assigned in the
 * visible part of this file; presumably set up during startup - confirm.
 */
uintptr_t qemu_host_page_size;
intptr_t qemu_host_page_mask;
 106
 107 #if !defined(CONFIG_USER_ONLY)
 108
 109 typedef struct PhysPageEntry PhysPageEntry;
 110
 111 struct PhysPageEntry {
 112     /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
 113     uint32_t skip : 6;
 114      /* index into phys_sections (!skip) or phys_map_nodes (skip) */
 115     uint32_t ptr : 26;
 116 };
 117
 118 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
 119
 120 /* Size of the L2 (and L3, etc) page tables.  */
 121 #define ADDR_SPACE_BITS 64
 122
 123 #define P_L2_BITS 9
 124 #define P_L2_SIZE (1 << P_L2_BITS)
 125
 126 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
 127
 128 typedef PhysPageEntry Node[P_L2_SIZE];
 129
 130 typedef struct PhysPageMap {
 131     struct rcu_head rcu;
 132
 133     unsigned sections_nb;
 134     unsigned sections_nb_alloc;
 135     unsigned nodes_nb;
 136     unsigned nodes_nb_alloc;
 137     Node *nodes;
 138     MemoryRegionSection *sections;
 139 } PhysPageMap;
 140
/* Lookup state used to map a physical address to a MemoryRegionSection. */
struct AddressSpaceDispatch {
    /* Last section returned by a lookup; accessed with
     * atomic_read()/atomic_set() in address_space_lookup_region().
     */
    MemoryRegionSection *mru_section;
    /* This is a multi-level map on the physical address space.
     * The bottom level has pointers to MemoryRegionSections.
     */
    PhysPageEntry phys_map;
    /* Node tables and sections that phys_map indexes into. */
    PhysPageMap map;
};
 149
 150 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
 151 typedef struct subpage_t {
 152     MemoryRegion iomem;
 153     FlatView *fv;
 154     hwaddr base;
 155     uint16_t sub_section[];
 156 } subpage_t;
 157
/* Index, in a PhysPageMap's sections array, of the section that lookups
 * fall back to when no mapping covers the address.
 */
#define PHYS_SECTION_UNASSIGNED 0

/* Forward declarations; the definitions are not in this part of the file. */
static void io_mem_init(void);
static void memory_map_init(void);
/* Installed as MemoryListener callbacks by cpu_address_space_init(). */
static void tcg_log_global_after_sync(MemoryListener *listener);
static void tcg_commit(MemoryListener *listener);
 164
/**
 * CPUAddressSpace: all the information a CPU needs about an AddressSpace
 * @cpu: the CPU whose AddressSpace this is
 * @as: the AddressSpace itself
 * @memory_dispatch: its dispatch pointer (cached, RCU protected); read
 *                   with atomic_rcu_read() by
 *                   address_space_translate_for_iotlb()
 * @tcg_as_listener: listener for tracking changes to the AddressSpace;
 *                   only registered by cpu_address_space_init() when
 *                   tcg_enabled()
 */
struct CPUAddressSpace {
    CPUState *cpu;
    AddressSpace *as;
    struct AddressSpaceDispatch *memory_dispatch;
    MemoryListener tcg_as_listener;
};
 178
/*
 * A copy of dirty-bitmap words for a range of RAM addresses.
 * NOTE(review): the users of this struct are outside the visible part of
 * the file; whether @end is inclusive or exclusive needs confirming there.
 */
struct DirtyBitmapSnapshot {
    ram_addr_t start;
    ram_addr_t end;
    /* Flexible array member: storage is sized by whoever allocates it. */
    unsigned long dirty[];
};
 184
 185 #endif
 186
 187 #if !defined(CONFIG_USER_ONLY)
 188
 189 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 190 {
 191     static unsigned alloc_hint = 16;
 192     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
 193         map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
 194         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
 195         alloc_hint = map->nodes_nb_alloc;
 196     }
 197 }
 198
 199 static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
 200 {
 201     unsigned i;
 202     uint32_t ret;
 203     PhysPageEntry e;
 204     PhysPageEntry *p;
 205
 206     ret = map->nodes_nb++;
 207     p = map->nodes[ret];
 208     assert(ret != PHYS_MAP_NODE_NIL);
 209     assert(ret != map->nodes_nb_alloc);
 210
 211     e.skip = leaf ? 0 : 1;
 212     e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
 213     for (i = 0; i < P_L2_SIZE; ++i) {
 214         memcpy(&p[i], &e, sizeof(e));
 215     }
 216     return ret;
 217 }
 218
 219 static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
 220                                 hwaddr *index, uint64_t *nb, uint16_t leaf,
 221                                 int level)
 222 {
 223     PhysPageEntry *p;
 224     hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 225
 226     if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
 227         lp->ptr = phys_map_node_alloc(map, level == 0);
 228     }
 229     p = map->nodes[lp->ptr];
 230     lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 231
 232     while (*nb && lp < &p[P_L2_SIZE]) {
 233         if ((*index & (step - 1)) == 0 && *nb >= step) {
 234             lp->skip = 0;
 235             lp->ptr = leaf;
 236             *index += step;
 237             *nb -= step;
 238         } else {
 239             phys_page_set_level(map, lp, index, nb, leaf, level - 1);
 240         }
 241         ++lp;
 242     }
 243 }
 244
 245 static void phys_page_set(AddressSpaceDispatch *d,
 246                           hwaddr index, uint64_t nb,
 247                           uint16_t leaf)
 248 {
 249     /* Wildly overreserve - it doesn't matter much. */
 250     phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 251
 252     phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 253 }
 254
 255 /* Compact a non leaf page entry. Simply detect that the entry has a single child,
 256  * and update our entry so we can skip it and go directly to the destination.
 257  */
 258 static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
 259 {
 260     unsigned valid_ptr = P_L2_SIZE;
 261     int valid = 0;
 262     PhysPageEntry *p;
 263     int i;
 264
 265     if (lp->ptr == PHYS_MAP_NODE_NIL) {
 266         return;
 267     }
 268
 269     p = nodes[lp->ptr];
 270     for (i = 0; i < P_L2_SIZE; i++) {
 271         if (p[i].ptr == PHYS_MAP_NODE_NIL) {
 272             continue;
 273         }
 274
 275         valid_ptr = i;
 276         valid++;
 277         if (p[i].skip) {
 278             phys_page_compact(&p[i], nodes);
 279         }
 280     }
 281
 282     /* We can only compress if there's only one child. */
 283     if (valid != 1) {
 284         return;
 285     }
 286
 287     assert(valid_ptr < P_L2_SIZE);
 288
 289     /* Don't compress if it won't fit in the # of bits we have. */
 290     if (P_L2_LEVELS >= (1 << 6) &&
 291         lp->skip + p[valid_ptr].skip >= (1 << 6)) {
 292         return;
 293     }
 294
 295     lp->ptr = p[valid_ptr].ptr;
 296     if (!p[valid_ptr].skip) {
 297         /* If our only child is a leaf, make this a leaf. */
 298         /* By design, we should have made this node a leaf to begin with so we
 299          * should never reach here.
 300          * But since it's so simple to handle this, let's do it just in case we
 301          * change this rule.
 302          */
 303         lp->skip = 0;
 304     } else {
 305         lp->skip += p[valid_ptr].skip;
 306     }
 307 }
 308
 309 void address_space_dispatch_compact(AddressSpaceDispatch *d)
 310 {
 311     if (d->phys_map.skip) {
 312         phys_page_compact(&d->phys_map, d->map.nodes);
 313     }
 314 }
 315
 316 static inline bool section_covers_addr(const MemoryRegionSection *section,
 317                                        hwaddr addr)
 318 {
 319     /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
 320      * the section must cover the entire address space.
 321      */
 322     return int128_gethi(section->size) ||
 323            range_covers_byte(section->offset_within_address_space,
 324                              int128_getlo(section->size), addr);
 325 }
 326
 327 static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
 328 {
 329     PhysPageEntry lp = d->phys_map, *p;
 330     Node *nodes = d->map.nodes;
 331     MemoryRegionSection *sections = d->map.sections;
 332     hwaddr index = addr >> TARGET_PAGE_BITS;
 333     int i;
 334
 335     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
 336         if (lp.ptr == PHYS_MAP_NODE_NIL) {
 337             return &sections[PHYS_SECTION_UNASSIGNED];
 338         }
 339         p = nodes[lp.ptr];
 340         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
 341     }
 342
 343     if (section_covers_addr(&sections[lp.ptr], addr)) {
 344         return &sections[lp.ptr];
 345     } else {
 346         return &sections[PHYS_SECTION_UNASSIGNED];
 347     }
 348 }
 349
 350 /* Called from RCU critical section */
 351 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
 352                                                         hwaddr addr,
 353                                                         bool resolve_subpage)
 354 {
 355     MemoryRegionSection *section = atomic_read(&d->mru_section);
 356     subpage_t *subpage;
 357
 358     if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
 359         !section_covers_addr(section, addr)) {
 360         section = phys_page_find(d, addr);
 361         atomic_set(&d->mru_section, section);
 362     }
 363     if (resolve_subpage && section->mr->subpage) {
 364         subpage = container_of(section->mr, subpage_t, iomem);
 365         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
 366     }
 367     return section;
 368 }
 369
 370 /* Called from RCU critical section */
 371 static MemoryRegionSection *
 372 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
 373                                  hwaddr *plen, bool resolve_subpage)
 374 {
 375     MemoryRegionSection *section;
 376     MemoryRegion *mr;
 377     Int128 diff;
 378
 379     section = address_space_lookup_region(d, addr, resolve_subpage);
 380     /* Compute offset within MemoryRegionSection */
 381     addr -= section->offset_within_address_space;
 382
 383     /* Compute offset within MemoryRegion */
 384     *xlat = addr + section->offset_within_region;
 385
 386     mr = section->mr;
 387
 388     /* MMIO registers can be expected to perform full-width accesses based only
 389      * on their address, without considering adjacent registers that could
 390      * decode to completely different MemoryRegions.  When such registers
 391      * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
 392      * regions overlap wildly.  For this reason we cannot clamp the accesses
 393      * here.
 394      *
 395      * If the length is small (as is the case for address_space_ldl/stl),
 396      * everything works fine.  If the incoming length is large, however,
 397      * the caller really has to do the clamping through memory_access_size.
 398      */
 399     if (memory_region_is_ram(mr)) {
 400         diff = int128_sub(section->size, int128_make64(addr));
 401         *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
 402     }
 403     return section;
 404 }
 405
 406 /**
 407  * address_space_translate_iommu - translate an address through an IOMMU
 408  * memory region and then through the target address space.
 409  *
 410  * @iommu_mr: the IOMMU memory region that we start the translation from
 411  * @addr: the address to be translated through the MMU
 412  * @xlat: the translated address offset within the destination memory region.
 413  *        It cannot be %NULL.
 414  * @plen_out: valid read/write length of the translated address. It
 415  *            cannot be %NULL.
 416  * @page_mask_out: page mask for the translated address. This
 417  *            should only be meaningful for IOMMU translated
 418  *            addresses, since there may be huge pages that this bit
 419  *            would tell. It can be %NULL if we don't care about it.
 420  * @is_write: whether the translation operation is for write
 421  * @is_mmio: whether this can be MMIO, set true if it can
 422  * @target_as: the address space targeted by the IOMMU
 423  * @attrs: transaction attributes
 424  *
 425  * This function is called from RCU critical section.  It is the common
 426  * part of flatview_do_translate and address_space_translate_cached.
 427  */
 428 static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
 429                                                          hwaddr *xlat,
 430                                                          hwaddr *plen_out,
 431                                                          hwaddr *page_mask_out,
 432                                                          bool is_write,
 433                                                          bool is_mmio,
 434                                                          AddressSpace **target_as,
 435                                                          MemTxAttrs attrs)
 436 {
 437     MemoryRegionSection *section;
 438     hwaddr page_mask = (hwaddr)-1;
 439
 440     do {
 441         hwaddr addr = *xlat;
 442         IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 443         int iommu_idx = 0;
 444         IOMMUTLBEntry iotlb;
 445
 446         if (imrc->attrs_to_index) {
 447             iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 448         }
 449
 450         iotlb = imrc->translate(iommu_mr, addr, is_write ?
 451                                 IOMMU_WO : IOMMU_RO, iommu_idx);
 452
 453         if (!(iotlb.perm & (1 << is_write))) {
 454             goto unassigned;
 455         }
 456
 457         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 458                 | (addr & iotlb.addr_mask));
 459         page_mask &= iotlb.addr_mask;
 460         *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
 461         *target_as = iotlb.target_as;
 462
 463         section = address_space_translate_internal(
 464                 address_space_to_dispatch(iotlb.target_as), addr, xlat,
 465                 plen_out, is_mmio);
 466
 467         iommu_mr = memory_region_get_iommu(section->mr);
 468     } while (unlikely(iommu_mr));
 469
 470     if (page_mask_out) {
 471         *page_mask_out = page_mask;
 472     }
 473     return *section;
 474
 475 unassigned:
 476     return (MemoryRegionSection) { .mr = &io_mem_unassigned };
 477 }
 478
 479 /**
 480  * flatview_do_translate - translate an address in FlatView
 481  *
 482  * @fv: the flat view that we want to translate on
 483  * @addr: the address to be translated in above address space
 484  * @xlat: the translated address offset within memory region. It
 485  *        cannot be @NULL.
 486  * @plen_out: valid read/write length of the translated address. It
 487  *            can be @NULL when we don't care about it.
 488  * @page_mask_out: page mask for the translated address. This
 489  *            should only be meaningful for IOMMU translated
 490  *            addresses, since there may be huge pages that this bit
 491  *            would tell. It can be @NULL if we don't care about it.
 492  * @is_write: whether the translation operation is for write
 493  * @is_mmio: whether this can be MMIO, set true if it can
 494  * @target_as: the address space targeted by the IOMMU
 495  * @attrs: memory transaction attributes
 496  *
 497  * This function is called from RCU critical section
 498  */
 499 static MemoryRegionSection flatview_do_translate(FlatView *fv,
 500                                                  hwaddr addr,
 501                                                  hwaddr *xlat,
 502                                                  hwaddr *plen_out,
 503                                                  hwaddr *page_mask_out,
 504                                                  bool is_write,
 505                                                  bool is_mmio,
 506                                                  AddressSpace **target_as,
 507                                                  MemTxAttrs attrs)
 508 {
 509     MemoryRegionSection *section;
 510     IOMMUMemoryRegion *iommu_mr;
 511     hwaddr plen = (hwaddr)(-1);
 512
 513     if (!plen_out) {
 514         plen_out = &plen;
 515     }
 516
 517     section = address_space_translate_internal(
 518             flatview_to_dispatch(fv), addr, xlat,
 519             plen_out, is_mmio);
 520
 521     iommu_mr = memory_region_get_iommu(section->mr);
 522     if (unlikely(iommu_mr)) {
 523         return address_space_translate_iommu(iommu_mr, xlat,
 524                                              plen_out, page_mask_out,
 525                                              is_write, is_mmio,
 526                                              target_as, attrs);
 527     }
 528     if (page_mask_out) {
 529         /* Not behind an IOMMU, use default page size. */
 530         *page_mask_out = ~TARGET_PAGE_MASK;
 531     }
 532
 533     return *section;
 534 }
 535
 536 /* Called from RCU critical section */
 537 IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
 538                                             bool is_write, MemTxAttrs attrs)
 539 {
 540     MemoryRegionSection section;
 541     hwaddr xlat, page_mask;
 542
 543     /*
 544      * This can never be MMIO, and we don't really care about plen,
 545      * but page mask.
 546      */
 547     section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
 548                                     NULL, &page_mask, is_write, false, &as,
 549                                     attrs);
 550
 551     /* Illegal translation */
 552     if (section.mr == &io_mem_unassigned) {
 553         goto iotlb_fail;
 554     }
 555
 556     /* Convert memory region offset into address space offset */
 557     xlat += section.offset_within_address_space -
 558         section.offset_within_region;
 559
 560     return (IOMMUTLBEntry) {
 561         .target_as = as,
 562         .iova = addr & ~page_mask,
 563         .translated_addr = xlat & ~page_mask,
 564         .addr_mask = page_mask,
 565         /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
 566         .perm = IOMMU_RW,
 567     };
 568
 569 iotlb_fail:
 570     return (IOMMUTLBEntry) {0};
 571 }
 572
 573 /* Called from RCU critical section */
 574 MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
 575                                  hwaddr *plen, bool is_write,
 576                                  MemTxAttrs attrs)
 577 {
 578     MemoryRegion *mr;
 579     MemoryRegionSection section;
 580     AddressSpace *as = NULL;
 581
 582     /* This can be MMIO, so setup MMIO bit. */
 583     section = flatview_do_translate(fv, addr, xlat, plen, NULL,
 584                                     is_write, true, &as, attrs);
 585     mr = section.mr;
 586
 587     if (xen_enabled() && memory_access_is_direct(mr, is_write)) {
 588         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
 589         *plen = MIN(page, *plen);
 590     }
 591
 592     return mr;
 593 }
 594
 595 typedef struct TCGIOMMUNotifier {
 596     IOMMUNotifier n;
 597     MemoryRegion *mr;
 598     CPUState *cpu;
 599     int iommu_idx;
 600     bool active;
 601 } TCGIOMMUNotifier;
 602
 603 static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
 604 {
 605     TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
 606
 607     if (!notifier->active) {
 608         return;
 609     }
 610     tlb_flush(notifier->cpu);
 611     notifier->active = false;
 612     /* We leave the notifier struct on the list to avoid reallocating it later.
 613      * Generally the number of IOMMUs a CPU deals with will be small.
 614      * In any case we can't unregister the iommu notifier from a notify
 615      * callback.
 616      */
 617 }
 618
 619 static void tcg_register_iommu_notifier(CPUState *cpu,
 620                                         IOMMUMemoryRegion *iommu_mr,
 621                                         int iommu_idx)
 622 {
 623     /* Make sure this CPU has an IOMMU notifier registered for this
 624      * IOMMU/IOMMU index combination, so that we can flush its TLB
 625      * when the IOMMU tells us the mappings we've cached have changed.
 626      */
 627     MemoryRegion *mr = MEMORY_REGION(iommu_mr);
 628     TCGIOMMUNotifier *notifier;
 629     Error *err = NULL;
 630     int i, ret;
 631
 632     for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 633         notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 634         if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
 635             break;
 636         }
 637     }
 638     if (i == cpu->iommu_notifiers->len) {
 639         /* Not found, add a new entry at the end of the array */
 640         cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
 641         notifier = g_new0(TCGIOMMUNotifier, 1);
 642         g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
 643
 644         notifier->mr = mr;
 645         notifier->iommu_idx = iommu_idx;
 646         notifier->cpu = cpu;
 647         /* Rather than trying to register interest in the specific part
 648          * of the iommu's address space that we've accessed and then
 649          * expand it later as subsequent accesses touch more of it, we
 650          * just register interest in the whole thing, on the assumption
 651          * that iommu reconfiguration will be rare.
 652          */
 653         iommu_notifier_init(&notifier->n,
 654                             tcg_iommu_unmap_notify,
 655                             IOMMU_NOTIFIER_UNMAP,
 656                             0,
 657                             HWADDR_MAX,
 658                             iommu_idx);
 659         ret = memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
 660                                                     &err);
 661         if (ret) {
 662             error_report_err(err);
 663             exit(1);
 664         }
 665     }
 666
 667     if (!notifier->active) {
 668         notifier->active = true;
 669     }
 670 }
 671
 672 static void tcg_iommu_free_notifier_list(CPUState *cpu)
 673 {
 674     /* Destroy the CPU's notifier list */
 675     int i;
 676     TCGIOMMUNotifier *notifier;
 677
 678     for (i = 0; i < cpu->iommu_notifiers->len; i++) {
 679         notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
 680         memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
 681         g_free(notifier);
 682     }
 683     g_array_free(cpu->iommu_notifiers, true);
 684 }
 685
 686 /* Called from RCU critical section */
 687 MemoryRegionSection *
 688 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
 689                                   hwaddr *xlat, hwaddr *plen,
 690                                   MemTxAttrs attrs, int *prot)
 691 {
 692     MemoryRegionSection *section;
 693     IOMMUMemoryRegion *iommu_mr;
 694     IOMMUMemoryRegionClass *imrc;
 695     IOMMUTLBEntry iotlb;
 696     int iommu_idx;
 697     AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch);
 698
 699     for (;;) {
 700         section = address_space_translate_internal(d, addr, &addr, plen, false);
 701
 702         iommu_mr = memory_region_get_iommu(section->mr);
 703         if (!iommu_mr) {
 704             break;
 705         }
 706
 707         imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
 708
 709         iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
 710         tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
 711         /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
 712          * doesn't short-cut its translation table walk.
 713          */
 714         iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
 715         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
 716                 | (addr & iotlb.addr_mask));
 717         /* Update the caller's prot bits to remove permissions the IOMMU
 718          * is giving us a failure response for. If we get down to no
 719          * permissions left at all we can give up now.
 720          */
 721         if (!(iotlb.perm & IOMMU_RO)) {
 722             *prot &= ~(PAGE_READ | PAGE_EXEC);
 723         }
 724         if (!(iotlb.perm & IOMMU_WO)) {
 725             *prot &= ~PAGE_WRITE;
 726         }
 727
 728         if (!*prot) {
 729             goto translate_fail;
 730         }
 731
 732         d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
 733     }
 734
 735     assert(!memory_region_is_iommu(section->mr));
 736     *xlat = addr;
 737     return section;
 738
 739 translate_fail:
 740     return &d->map.sections[PHYS_SECTION_UNASSIGNED];
 741 }
 742 #endif
 743
 744 #if !defined(CONFIG_USER_ONLY)
 745
 746 static int cpu_common_post_load(void *opaque, int version_id)
 747 {
 748     CPUState *cpu = opaque;
 749
 750     /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the
 751        version_id is increased. */
 752     cpu->interrupt_request &= ~0x01;
 753     tlb_flush(cpu);
 754
 755     /* loadvm has just updated the content of RAM, bypassing the
 756      * usual mechanisms that ensure we flush TBs for writes to
 757      * memory we've translated code from. So we must flush all TBs,
 758      * which will now be stale.
 759      */
 760     tb_flush(cpu);
 761
 762     return 0;
 763 }
 764
 765 static int cpu_common_pre_load(void *opaque)
 766 {
 767     CPUState *cpu = opaque;
 768
 769     cpu->exception_index = -1;
 770
 771     return 0;
 772 }
 773
 774 static bool cpu_common_exception_index_needed(void *opaque)
 775 {
 776     CPUState *cpu = opaque;
 777
 778     return tcg_enabled() && cpu->exception_index != -1;
 779 }
 780
 781 static const VMStateDescription vmstate_cpu_common_exception_index = {
 782     .name = "cpu_common/exception_index",
 783     .version_id = 1,
 784     .minimum_version_id = 1,
 785     .needed = cpu_common_exception_index_needed,
 786     .fields = (VMStateField[]) {
 787         VMSTATE_INT32(exception_index, CPUState),
 788         VMSTATE_END_OF_LIST()
 789     }
 790 };
 791
 792 static bool cpu_common_crash_occurred_needed(void *opaque)
 793 {
 794     CPUState *cpu = opaque;
 795
 796     return cpu->crash_occurred;
 797 }
 798
 799 static const VMStateDescription vmstate_cpu_common_crash_occurred = {
 800     .name = "cpu_common/crash_occurred",
 801     .version_id = 1,
 802     .minimum_version_id = 1,
 803     .needed = cpu_common_crash_occurred_needed,
 804     .fields = (VMStateField[]) {
 805         VMSTATE_BOOL(crash_occurred, CPUState),
 806         VMSTATE_END_OF_LIST()
 807     }
 808 };
 809
 810 const VMStateDescription vmstate_cpu_common = {
 811     .name = "cpu_common",
 812     .version_id = 1,
 813     .minimum_version_id = 1,
 814     .pre_load = cpu_common_pre_load,
 815     .post_load = cpu_common_post_load,
 816     .fields = (VMStateField[]) {
 817         VMSTATE_UINT32(halted, CPUState),
 818         VMSTATE_UINT32(interrupt_request, CPUState),
 819         VMSTATE_END_OF_LIST()
 820     },
 821     .subsections = (const VMStateDescription*[]) {
 822         &vmstate_cpu_common_exception_index,
 823         &vmstate_cpu_common_crash_occurred,
 824         NULL
 825     }
 826 };
 827
 828 #endif
 829
 830 CPUState *qemu_get_cpu(int index)
 831 {
 832     CPUState *cpu;
 833
 834     CPU_FOREACH(cpu) {
 835         if (cpu->cpu_index == index) {
 836             return cpu;
 837         }
 838     }
 839
 840     return NULL;
 841 }
 842
 843 #if !defined(CONFIG_USER_ONLY)
 844 void cpu_address_space_init(CPUState *cpu, int asidx,
 845                             const char *prefix, MemoryRegion *mr)
 846 {
 847     CPUAddressSpace *newas;
 848     AddressSpace *as = g_new0(AddressSpace, 1);
 849     char *as_name;
 850
 851     assert(mr);
 852     as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
 853     address_space_init(as, mr, as_name);
 854     g_free(as_name);
 855
 856     /* Target code should have set num_ases before calling us */
 857     assert(asidx < cpu->num_ases);
 858
 859     if (asidx == 0) {
 860         /* address space 0 gets the convenience alias */
 861         cpu->as = as;
 862     }
 863
 864     /* KVM cannot currently support multiple address spaces. */
 865     assert(asidx == 0 || !kvm_enabled());
 866
 867     if (!cpu->cpu_ases) {
 868         cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
 869     }
 870
 871     newas = &cpu->cpu_ases[asidx];
 872     newas->cpu = cpu;
 873     newas->as = as;
 874     if (tcg_enabled()) {
 875         newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
 876         newas->tcg_as_listener.commit = tcg_commit;
 877         memory_listener_register(&newas->tcg_as_listener, as);
 878     }
 879 }
 880
 881 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
 882 {
 883     /* Return the AddressSpace corresponding to the specified index */
 884     return cpu->cpu_ases[asidx].as;
 885 }
 886 #endif
 887
 888 void cpu_exec_unrealizefn(CPUState *cpu)
 889 {
 890     CPUClass *cc = CPU_GET_CLASS(cpu);
 891
 892     cpu_list_remove(cpu);
 893
 894     if (cc->vmsd != NULL) {
 895         vmstate_unregister(NULL, cc->vmsd, cpu);
 896     }
 897     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 898         vmstate_unregister(NULL, &vmstate_cpu_common, cpu);
 899     }
 900 #ifndef CONFIG_USER_ONLY
 901     tcg_iommu_free_notifier_list(cpu);
 902 #endif
 903 }
 904
/*
 * Property table cpu_common_props.  In builds without CONFIG_USER_ONLY
 * it contains a single "memory" link property; otherwise it holds only
 * the end-of-list terminator.
 */
Property cpu_common_props[] = {
#ifndef CONFIG_USER_ONLY
    /* Create a memory property for softmmu CPU object,
     * so users can wire up its memory. (This can't go in hw/core/cpu.c
     * because that file is compiled only once for both user-mode
     * and system builds.) The default if no link is set up is to use
     * the system address space.
     */
    DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION,
                     MemoryRegion *),
#endif
    DEFINE_PROP_END_OF_LIST(),
};
 918
 919 void cpu_exec_initfn(CPUState *cpu)
 920 {
 921     cpu->as = NULL;
 922     cpu->num_ases = 0;
 923
 924 #ifndef CONFIG_USER_ONLY
 925     cpu->thread_id = qemu_get_thread_id();
 926     cpu->memory = system_memory;
 927     object_ref(OBJECT(cpu->memory));
 928 #endif
 929 }
 930
 931 void cpu_exec_realizefn(CPUState *cpu, Error **errp)
 932 {
 933     CPUClass *cc = CPU_GET_CLASS(cpu);
 934     static bool tcg_target_initialized;
 935
 936     cpu_list_add(cpu);
 937
 938     if (tcg_enabled() && !tcg_target_initialized) {
 939         tcg_target_initialized = true;
 940         cc->tcg_initialize();
 941     }
 942     tlb_init(cpu);
 943
 944     qemu_plugin_vcpu_init_hook(cpu);
 945
 946 #ifndef CONFIG_USER_ONLY
 947     if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 948         vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
 949     }
 950     if (cc->vmsd != NULL) {
 951         vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu);
 952     }
 953
 954     cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
 955 #endif
 956 }
 957
 958 const char *parse_cpu_option(const char *cpu_option)
 959 {
 960     ObjectClass *oc;
 961     CPUClass *cc;
 962     gchar **model_pieces;
 963     const char *cpu_type;
 964
 965     model_pieces = g_strsplit(cpu_option, ",", 2);
 966     if (!model_pieces[0]) {
 967         error_report("-cpu option cannot be empty");
 968         exit(1);
 969     }
 970
 971     oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
 972     if (oc == NULL) {
 973         error_report("unable to find CPU model '%s'", model_pieces[0]);
 974         g_strfreev(model_pieces);
 975         exit(EXIT_FAILURE);
 976     }
 977
 978     cpu_type = object_class_get_name(oc);
 979     cc = CPU_CLASS(oc);
 980     cc->parse_features(cpu_type, model_pieces[1], &error_fatal);
 981     g_strfreev(model_pieces);
 982     return cpu_type;
 983 }
 984
 985 #if defined(CONFIG_USER_ONLY)
 986 void tb_invalidate_phys_addr(target_ulong addr)
 987 {
 988     mmap_lock();
 989     tb_invalidate_phys_page_range(addr, addr + 1);
 990     mmap_unlock();
 991 }
 992
/*
 * User-mode variant: forward @pc to tb_invalidate_phys_addr().
 * @cpu is not used here.
 */
static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
{
    tb_invalidate_phys_addr(pc);
}
 997 #else
 998 void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs)
 999 {
1000     ram_addr_t ram_addr;
1001     MemoryRegion *mr;
1002     hwaddr l = 1;
1003
1004     if (!tcg_enabled()) {
1005         return;
1006     }
1007
1008     RCU_READ_LOCK_GUARD();
1009     mr = address_space_translate(as, addr, &addr, &l, false, attrs);
1010     if (!(memory_region_is_ram(mr)
1011           || memory_region_is_romd(mr))) {
1012         return;
1013     }
1014     ram_addr = memory_region_get_ram_addr(mr) + addr;
1015     tb_invalidate_phys_page_range(ram_addr, ram_addr + 1);
1016 }
1017
1018 static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
1019 {
1020     MemTxAttrs attrs;
1021     hwaddr phys = cpu_get_phys_page_attrs_debug(cpu, pc, &attrs);
1022     int asidx = cpu_asidx_from_attrs(cpu, attrs);
1023     if (phys != -1) {
1024         /* Locks grabbed by tb_invalidate_phys_addr */
1025         tb_invalidate_phys_addr(cpu->cpu_ases[asidx].as,
1026                                 phys | (pc & ~TARGET_PAGE_MASK), attrs);
1027     }
1028 }
1029 #endif
1030
1031 #ifndef CONFIG_USER_ONLY
1032 /* Add a watchpoint.  */
1033 int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
1034                           int flags, CPUWatchpoint **watchpoint)
1035 {
1036     CPUWatchpoint *wp;
1037
1038     /* forbid ranges which are empty or run off the end of the address space */
1039     if (len == 0 || (addr + len - 1) < addr) {
1040         error_report("tried to set invalid watchpoint at %"
1041                      VADDR_PRIx ", len=%" VADDR_PRIu, addr, len);
1042         return -EINVAL;
1043     }
1044     wp = g_malloc(sizeof(*wp));
1045
1046     wp->vaddr = addr;
1047     wp->len = len;
1048     wp->flags = flags;
1049
1050     /* keep all GDB-injected watchpoints in front */
1051     if (flags & BP_GDB) {
1052         QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry);
1053     } else {
1054         QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry);
1055     }
1056
1057     tlb_flush_page(cpu, addr);
1058
1059     if (watchpoint)
1060         *watchpoint = wp;
1061     return 0;
1062 }
1063
1064 /* Remove a specific watchpoint.  */
1065 int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
1066                           int flags)
1067 {
1068     CPUWatchpoint *wp;
1069
1070     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1071         if (addr == wp->vaddr && len == wp->len
1072                 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) {
1073             cpu_watchpoint_remove_by_ref(cpu, wp);
1074             return 0;
1075         }
1076     }
1077     return -ENOENT;
1078 }
1079
1080 /* Remove a specific watchpoint by reference.  */
1081 void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
1082 {
1083     QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry);
1084
1085     tlb_flush_page(cpu, watchpoint->vaddr);
1086
1087     g_free(watchpoint);
1088 }
1089
1090 /* Remove all matching watchpoints.  */
1091 void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
1092 {
1093     CPUWatchpoint *wp, *next;
1094
1095     QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) {
1096         if (wp->flags & mask) {
1097             cpu_watchpoint_remove_by_ref(cpu, wp);
1098         }
1099     }
1100 }
1101
1102 /* Return true if this watchpoint address matches the specified
1103  * access (ie the address range covered by the watchpoint overlaps
1104  * partially or completely with the address range covered by the
1105  * access).
1106  */
1107 static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
1108                                               vaddr addr, vaddr len)
1109 {
1110     /* We know the lengths are non-zero, but a little caution is
1111      * required to avoid errors in the case where the range ends
1112      * exactly at the top of the address space and so addr + len
1113      * wraps round to zero.
1114      */
1115     vaddr wpend = wp->vaddr + wp->len - 1;
1116     vaddr addrend = addr + len - 1;
1117
1118     return !(addr > wpend || wp->vaddr > addrend);
1119 }
1120
1121 /* Return flags for watchpoints that match addr + prot.  */
1122 int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
1123 {
1124     CPUWatchpoint *wp;
1125     int ret = 0;
1126
1127     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
1128         if (watchpoint_address_matches(wp, addr, TARGET_PAGE_SIZE)) {
1129             ret |= wp->flags;
1130         }
1131     }
1132     return ret;
1133 }
1134 #endif /* !CONFIG_USER_ONLY */
1135
1136 /* Add a breakpoint.  */
1137 int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
1138                           CPUBreakpoint **breakpoint)
1139 {
1140     CPUBreakpoint *bp;
1141
1142     bp = g_malloc(sizeof(*bp));
1143
1144     bp->pc = pc;
1145     bp->flags = flags;
1146
1147     /* keep all GDB-injected breakpoints in front */
1148     if (flags & BP_GDB) {
1149         QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry);
1150     } else {
1151         QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry);
1152     }
1153
1154     breakpoint_invalidate(cpu, pc);
1155
1156     if (breakpoint) {
1157         *breakpoint = bp;
1158     }
1159     return 0;
1160 }
1161
1162 /* Remove a specific breakpoint.  */
1163 int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags)
1164 {
1165     CPUBreakpoint *bp;
1166
1167     QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1168         if (bp->pc == pc && bp->flags == flags) {
1169             cpu_breakpoint_remove_by_ref(cpu, bp);
1170             return 0;
1171         }
1172     }
1173     return -ENOENT;
1174 }
1175
1176 /* Remove a specific breakpoint by reference.  */
1177 void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint)
1178 {
1179     QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry);
1180
1181     breakpoint_invalidate(cpu, breakpoint->pc);
1182
1183     g_free(breakpoint);
1184 }
1185
1186 /* Remove all matching breakpoints. */
1187 void cpu_breakpoint_remove_all(CPUState *cpu, int mask)
1188 {
1189     CPUBreakpoint *bp, *next;
1190
1191     QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) {
1192         if (bp->flags & mask) {
1193             cpu_breakpoint_remove_by_ref(cpu, bp);
1194         }
1195     }
1196 }
1197
1198 /* enable or disable single step mode. EXCP_DEBUG is returned by the
1199    CPU loop after each instruction */
1200 void cpu_single_step(CPUState *cpu, int enabled)
1201 {
1202     if (cpu->singlestep_enabled != enabled) {
1203         cpu->singlestep_enabled = enabled;
1204         if (kvm_enabled()) {
1205             kvm_update_guest_debug(cpu, 0);
1206         } else {
1207             /* must flush all the translated code to avoid inconsistencies */
1208             /* XXX: only flush what is necessary */
1209             tb_flush(cpu);
1210         }
1211     }
1212 }
1213
/*
 * Report a fatal error for @cpu and abort the process.
 *
 * The printf-style message built from @fmt is written to stderr,
 * prefixed with "qemu: fatal: " and followed by a dump of the CPU
 * state.  When qemu_log_separate() is true the same message and CPU
 * state are also written to the QEMU log, which is then flushed and
 * closed.  This function always ends in abort() and does not return.
 */
void cpu_abort(CPUState *cpu, const char *fmt, ...)
{
    va_list ap;
    va_list ap2;

    va_start(ap, fmt);
    /* ap is consumed by the stderr output; ap2 is a copy for the log */
    va_copy(ap2, ap);
    fprintf(stderr, "qemu: fatal: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    cpu_dump_state(cpu, stderr, CPU_DUMP_FPU | CPU_DUMP_CCOP);
    if (qemu_log_separate()) {
        qemu_log_lock();
        qemu_log("qemu: fatal: ");
        qemu_log_vprintf(fmt, ap2);
        qemu_log("\n");
        log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
        qemu_log_flush();
        qemu_log_unlock();
        qemu_log_close();
    }
    va_end(ap2);
    va_end(ap);
    replay_finish();
#if defined(CONFIG_USER_ONLY)
    {
        /* Reset SIGABRT to its default disposition before calling abort() */
        struct sigaction act;
        sigfillset(&act.sa_mask);
        act.sa_handler = SIG_DFL;
        act.sa_flags = 0;
        sigaction(SIGABRT, &act, NULL);
    }
#endif
    abort();
}
1249
1250 #if !defined(CONFIG_USER_ONLY)
1251 /* Called from RCU critical section */
1252 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
1253 {
1254     RAMBlock *block;
1255
1256     block = atomic_rcu_read(&ram_list.mru_block);
1257     if (block && addr - block->offset < block->max_length) {
1258         return block;
1259     }
1260     RAMBLOCK_FOREACH(block) {
1261         if (addr - block->offset < block->max_length) {
1262             goto found;
1263         }
1264     }
1265
1266     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
1267     abort();
1268
1269 found:
1270     /* It is safe to write mru_block outside the iothread lock.  This
1271      * is what happens:
1272      *
1273      *     mru_block = xxx
1274      *     rcu_read_unlock()
1275      *                                        xxx removed from list
1276      *                  rcu_read_lock()
1277      *                  read mru_block
1278      *                                        mru_block = NULL;
1279      *                                        call_rcu(reclaim_ramblock, xxx);
1280      *                  rcu_read_unlock()
1281      *
1282      * atomic_rcu_set is not needed here.  The block was already published
1283      * when it was placed into the list.  Here we're just making an extra
1284      * copy of the pointer.
1285      */
1286     ram_list.mru_block = block;
1287     return block;
1288 }
1289
1290 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
1291 {
1292     CPUState *cpu;
1293     ram_addr_t start1;
1294     RAMBlock *block;
1295     ram_addr_t end;
1296
1297     assert(tcg_enabled());
1298     end = TARGET_PAGE_ALIGN(start + length);
1299     start &= TARGET_PAGE_MASK;
1300
1301     RCU_READ_LOCK_GUARD();
1302     block = qemu_get_ram_block(start);
1303     assert(block == qemu_get_ram_block(end - 1));
1304     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
1305     CPU_FOREACH(cpu) {
1306         tlb_reset_dirty(cpu, start1, length);
1307     }
1308 }
1309
1310 /* Note: start and end must be within the same ram block.  */
1311 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
1312                                               ram_addr_t length,
1313                                               unsigned client)
1314 {
1315     DirtyMemoryBlocks *blocks;
1316     unsigned long end, page;
1317     bool dirty = false;
1318     RAMBlock *ramblock;
1319     uint64_t mr_offset, mr_size;
1320
1321     if (length == 0) {
1322         return false;
1323     }
1324
1325     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
1326     page = start >> TARGET_PAGE_BITS;
1327
1328     WITH_RCU_READ_LOCK_GUARD() {
1329         blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
1330         ramblock = qemu_get_ram_block(start);
1331         /* Range sanity check on the ramblock */
1332         assert(start >= ramblock->offset &&
1333                start + length <= ramblock->offset + ramblock->used_length);
1334
1335         while (page < end) {
1336             unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
1337             unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
1338             unsigned long num = MIN(end - page,
1339                                     DIRTY_MEMORY_BLOCK_SIZE - offset);
1340
1341             dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
1342                                                   offset, num);
1343             page += num;
1344         }
1345
1346         mr_offset = (ram_addr_t)(page << TARGET_PAGE_BITS) - ramblock->offset;
1347         mr_size = (end - page) << TARGET_PAGE_BITS;
1348         memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
1349     }
1350
1351     if (dirty && tcg_enabled()) {
1352         tlb_reset_dirty_range_all(start, length);
1353     }
1354
1355     return dirty;
1356 }
1357
/*
 * Copy the @client dirty bits for a range of @mr into a newly
 * allocated snapshot and clear them in the live bitmap.
 *
 * The requested range [offset, offset + length) of @mr is converted to
 * a ram address range and widened to a multiple of
 * 1 << (TARGET_PAGE_BITS + BITS_PER_LEVEL) bytes; the widened bounds
 * are stored in snap->start and snap->end.  Afterwards the TLB dirty
 * state (TCG only) and the region's dirty bitmap are cleared for the
 * originally requested range.
 *
 * Returns the snapshot, allocated with g_malloc0().
 * NOTE(review): presumably the caller releases it with g_free() --
 * confirm against callers.
 */
DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
    (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
{
    DirtyMemoryBlocks *blocks;
    ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
    unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
    ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
    ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
    DirtyBitmapSnapshot *snap;
    unsigned long page, end, dest;

    /* One bit per target page: (bytes >> TARGET_PAGE_BITS) / 8 bytes */
    snap = g_malloc0(sizeof(*snap) +
                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
    snap->start = first;
    snap->end   = last;

    page = first >> TARGET_PAGE_BITS;
    end  = last  >> TARGET_PAGE_BITS;
    dest = 0;

    WITH_RCU_READ_LOCK_GUARD() {
        blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);

        /* Walk the range one dirty-memory block at a time */
        while (page < end) {
            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
            unsigned long num = MIN(end - page,
                                    DIRTY_MEMORY_BLOCK_SIZE - offset);

            assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL)));
            assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
            /*
             * From here on 'offset' indexes array elements of the
             * bitmap rather than bits (presumably 1 << BITS_PER_LEVEL
             * bits per element -- TODO confirm).
             */
            offset >>= BITS_PER_LEVEL;

            bitmap_copy_and_clear_atomic(snap->dirty + dest,
                                         blocks->blocks[idx] + offset,
                                         num);
            page += num;
            dest += num >> BITS_PER_LEVEL;
        }
    }

    if (tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }

    memory_region_clear_dirty_bitmap(mr, offset, length);

    return snap;
}
1407
1408 bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
1409                                             ram_addr_t start,
1410                                             ram_addr_t length)
1411 {
1412     unsigned long page, end;
1413
1414     assert(start >= snap->start);
1415     assert(start + length <= snap->end);
1416
1417     end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
1418     page = (start - snap->start) >> TARGET_PAGE_BITS;
1419
1420     while (page < end) {
1421         if (test_bit(page, snap->dirty)) {
1422             return true;
1423         }
1424         page++;
1425     }
1426     return false;
1427 }
1428
1429 /* Called from RCU critical section */
1430 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1431                                        MemoryRegionSection *section)
1432 {
1433     AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
1434     return section - d->map.sections;
1435 }
#endif /* !defined(CONFIG_USER_ONLY) */
1437
1438 #if !defined(CONFIG_USER_ONLY)
1439
/* Forward declarations of the subpage helpers used by register_subpage() */
static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
                            uint16_t section);
static subpage_t *subpage_init(FlatView *fv, hwaddr base);

/*
 * Allocator hook for guest RAM; defaults to qemu_anon_ram_alloc and can
 * be replaced through phys_mem_set_alloc().
 */
static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
                               qemu_anon_ram_alloc;
1446
/*
 * Set a custom physical guest memory allocator.
 * Accelerators with unusual needs may need this.  Hopefully, we can
 * get rid of it eventually.
 *
 * @alloc replaces the current phys_mem_alloc hook unconditionally.
 */
void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
{
    phys_mem_alloc = alloc;
}
1456
1457 static uint16_t phys_section_add(PhysPageMap *map,
1458                                  MemoryRegionSection *section)
1459 {
1460     /* The physical section number is ORed with a page-aligned
1461      * pointer to produce the iotlb entries.  Thus it should
1462      * never overflow into the page-aligned value.
1463      */
1464     assert(map->sections_nb < TARGET_PAGE_SIZE);
1465
1466     if (map->sections_nb == map->sections_nb_alloc) {
1467         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1468         map->sections = g_renew(MemoryRegionSection, map->sections,
1469                                 map->sections_nb_alloc);
1470     }
1471     map->sections[map->sections_nb] = *section;
1472     memory_region_ref(section->mr);
1473     return map->sections_nb++;
1474 }
1475
1476 static void phys_section_destroy(MemoryRegion *mr)
1477 {
1478     bool have_sub_page = mr->subpage;
1479
1480     memory_region_unref(mr);
1481
1482     if (have_sub_page) {
1483         subpage_t *subpage = container_of(mr, subpage_t, iomem);
1484         object_unref(OBJECT(&subpage->iomem));
1485         g_free(subpage);
1486     }
1487 }
1488
1489 static void phys_sections_free(PhysPageMap *map)
1490 {
1491     while (map->sections_nb > 0) {
1492         MemoryRegionSection *section = &map->sections[--map->sections_nb];
1493         phys_section_destroy(section->mr);
1494     }
1495     g_free(map->sections);
1496     g_free(map->nodes);
1497 }
1498
/*
 * Register @section, which covers only part of a target page, in the
 * dispatch of @fv.
 *
 * If the page holding the section does not yet map to a subpage
 * container, one is created with subpage_init() and installed for the
 * whole page; otherwise the existing container is reused.  The section
 * is then added to the dispatch map and registered inside the
 * container for the byte range it occupies within the page.
 */
static void register_subpage(FlatView *fv, MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    subpage_t *subpage;
    /* Address of the page containing the section */
    hwaddr base = section->offset_within_address_space
        & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d, base);
    /* Page-sized section describing the subpage container itself */
    MemoryRegionSection subsection = {
        .offset_within_address_space = base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    hwaddr start, end;

    /* The page is either already a subpage or still unassigned */
    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (!(existing->mr->subpage)) {
        subpage = subpage_init(fv, base);
        subsection.fv = fv;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    } else {
        subpage = container_of(existing->mr, subpage_t, iomem);
    }
    /* start/end are inclusive byte offsets within the page */
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}
1528
1529
1530 static void register_multipage(FlatView *fv,
1531                                MemoryRegionSection *section)
1532 {
1533     AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1534     hwaddr start_addr = section->offset_within_address_space;
1535     uint16_t section_index = phys_section_add(&d->map, section);
1536     uint64_t num_pages = int128_get64(int128_rshift(section->size,
1537                                                     TARGET_PAGE_BITS));
1538
1539     assert(num_pages);
1540     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1541 }
1542
/*
 * Add @section to the dispatch of @fv, splitting it at target page
 * boundaries.
 *
 * The range in *section* may look like this:
 *
 *      |s|PPPPPPP|s|
 *
 * where s stands for subpage and P for page.
 *
 * An unaligned head and a trailing partial page are registered with
 * register_subpage(); the whole pages in between are registered with
 * register_multipage().
 */
void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
{
    /* Working copy, shrunk from the front as pieces are registered */
    MemoryRegionSection remain = *section;
    Int128 page_size = int128_make64(TARGET_PAGE_SIZE);

    /* register first subpage */
    if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
        /* Bytes from the section start up to the next page boundary */
        uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
                        - remain.offset_within_address_space;

        MemoryRegionSection now = remain;
        now.size = int128_min(int128_make64(left), now.size);
        register_subpage(fv, &now);
        if (int128_eq(remain.size, now.size)) {
            /* The section ended inside its first page */
            return;
        }
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
    }

    /* register whole pages */
    if (int128_ge(remain.size, page_size)) {
        MemoryRegionSection now = remain;
        /* Round the size down to a multiple of the page size */
        now.size = int128_and(now.size, int128_neg(page_size));
        register_multipage(fv, &now);
        if (int128_eq(remain.size, now.size)) {
            /* No partial page left at the tail */
            return;
        }
        remain.size = int128_sub(remain.size, now.size);
        remain.offset_within_address_space += int128_get64(now.size);
        remain.offset_within_region += int128_get64(now.size);
    }

    /* register last subpage */
    register_subpage(fv, &remain);
}
1587
/* Flush the KVM coalesced MMIO buffer; does nothing unless KVM is enabled. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }
    kvm_flush_coalesced_mmio_buffer();
}
1593
/* Acquire ram_list.mutex. */
void qemu_mutex_lock_ramlist(void)
{
    qemu_mutex_lock(&ram_list.mutex);
}
1598
/* Release ram_list.mutex. */
void qemu_mutex_unlock_ramlist(void)
{
    qemu_mutex_unlock(&ram_list.mutex);
}
1603
1604 void ram_block_dump(Monitor *mon)
1605 {
1606     RAMBlock *block;
1607     char *psize;
1608
1609     RCU_READ_LOCK_GUARD();
1610     monitor_printf(mon, "%24s %8s  %18s %18s %18s\n",
1611                    "Block Name", "PSize", "Offset", "Used", "Total");
1612     RAMBLOCK_FOREACH(block) {
1613         psize = size_to_str(block->page_size);
1614         monitor_printf(mon, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1615                        " 0x%016" PRIx64 "\n", block->idstr, psize,
1616                        (uint64_t)block->offset,
1617                        (uint64_t)block->used_length,
1618                        (uint64_t)block->max_length);
1619         g_free(psize);
1620     }
1621 }
1622
1623 #ifdef __linux__
1624 /*
1625  * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
1626  * may or may not name the same files / on the same filesystem now as
1627  * when we actually open and map them.  Iterate over the file
1628  * descriptors instead, and use qemu_fd_getpagesize().
1629  */
1630 static int find_min_backend_pagesize(Object *obj, void *opaque)
1631 {
1632     long *hpsize_min = opaque;
1633
1634     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1635         HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1636         long hpsize = host_memory_backend_pagesize(backend);
1637
1638         if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
1639             *hpsize_min = hpsize;
1640         }
1641     }
1642
1643     return 0;
1644 }
1645
1646 static int find_max_backend_pagesize(Object *obj, void *opaque)
1647 {
1648     long *hpsize_max = opaque;
1649
1650     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1651         HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1652         long hpsize = host_memory_backend_pagesize(backend);
1653
1654         if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
1655             *hpsize_max = hpsize;
1656         }
1657     }
1658
1659     return 0;
1660 }
1661
1662 /*
1663  * TODO: We assume right now that all mapped host memory backends are
1664  * used as RAM, however some might be used for different purposes.
1665  */
1666 long qemu_minrampagesize(void)
1667 {
1668     long hpsize = LONG_MAX;
1669     long mainrampagesize;
1670     Object *memdev_root;
1671     MachineState *ms = MACHINE(qdev_get_machine());
1672
1673     mainrampagesize = qemu_mempath_getpagesize(mem_path);
1674
1675     /* it's possible we have memory-backend objects with
1676      * hugepage-backed RAM. these may get mapped into system
1677      * address space via -numa parameters or memory hotplug
1678      * hooks. we want to take these into account, but we
1679      * also want to make sure these supported hugepage
1680      * sizes are applicable across the entire range of memory
1681      * we may boot from, so we take the min across all
1682      * backends, and assume normal pages in cases where a
1683      * backend isn't backed by hugepages.
1684      */
1685     memdev_root = object_resolve_path("/objects", NULL);
1686     if (memdev_root) {
1687         object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
1688     }
1689     if (hpsize == LONG_MAX) {
1690         /* No additional memory regions found ==> Report main RAM page size */
1691         return mainrampagesize;
1692     }
1693
1694     /* If NUMA is disabled or the NUMA nodes are not backed with a
1695      * memory-backend, then there is at least one node using "normal" RAM,
1696      * so if its page size is smaller we have got to report that size instead.
1697      */
1698     if (hpsize > mainrampagesize &&
1699         (ms->numa_state == NULL ||
1700          ms->numa_state->num_nodes == 0 ||
1701          ms->numa_state->nodes[0].node_memdev == NULL)) {
1702         static bool warned;
1703         if (!warned) {
1704             error_report("Huge page support disabled (n/a for main memory).");
1705             warned = true;
1706         }
1707         return mainrampagesize;
1708     }
1709
1710     return hpsize;
1711 }
1712
1713 long qemu_maxrampagesize(void)
1714 {
1715     long pagesize = qemu_mempath_getpagesize(mem_path);
1716     Object *memdev_root = object_resolve_path("/objects", NULL);
1717
1718     if (memdev_root) {
1719         object_child_foreach(memdev_root, find_max_backend_pagesize,
1720                              &pagesize);
1721     }
1722     return pagesize;
1723 }
1724 #else
/* Non-Linux build: the minimum RAM page size is the host page size. */
long qemu_minrampagesize(void)
{
    return qemu_real_host_page_size;
}
/* Non-Linux build: the maximum RAM page size is the host page size. */
long qemu_maxrampagesize(void)
{
    return qemu_real_host_page_size;
}
1733 #endif
1734
1735 #ifdef CONFIG_POSIX
/*
 * Return the size in bytes of the file open on @fd, or a negative
 * errno value on failure.
 *
 * On Linux, a character device whose sysfs "subsystem" link ends in
 * "/dax" has its size read from the sysfs "size" attribute.  In every
 * other case (including when that attribute cannot be read) the size
 * is obtained with lseek(fd, 0, SEEK_END), which also moves the file
 * offset to the end.
 */
static int64_t get_file_size(int fd)
{
    int64_t size;
#if defined(__linux__)
    struct stat st;

    if (fstat(fd, &st) < 0) {
        return -errno;
    }

    /* Special handling for devdax character devices */
    if (S_ISCHR(st.st_mode)) {
        /* g_autofree: released automatically when leaving this scope */
        g_autofree char *subsystem_path = NULL;
        g_autofree char *subsystem = NULL;

        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
                                         major(st.st_rdev), minor(st.st_rdev));
        subsystem = g_file_read_link(subsystem_path, NULL);

        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
            g_autofree char *size_path = NULL;
            g_autofree char *size_str = NULL;

            size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
                                    major(st.st_rdev), minor(st.st_rdev));

            if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
                /* Base 0: the prefix of the string selects the radix */
                return g_ascii_strtoll(size_str, NULL, 0);
            }
        }
    }
#endif /* defined(__linux__) */

    /* st.st_size may be zero for special files yet lseek(2) works */
    size = lseek(fd, 0, SEEK_END);
    if (size < 0) {
        return -errno;
    }
    return size;
}
1776
1777 static int file_ram_open(const char *path,
1778                          const char *region_name,
1779                          bool *created,
1780                          Error **errp)
1781 {
1782     char *filename;
1783     char *sanitized_name;
1784     char *c;
1785     int fd = -1;
1786
1787     *created = false;
1788     for (;;) {
1789         fd = open(path, O_RDWR);
1790         if (fd >= 0) {
1791             /* @path names an existing file, use it */
1792             break;
1793         }
1794         if (errno == ENOENT) {
1795             /* @path names a file that doesn't exist, create it */
1796             fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1797             if (fd >= 0) {
1798                 *created = true;
1799                 break;
1800             }
1801         } else if (errno == EISDIR) {
1802             /* @path names a directory, create a file there */
1803             /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1804             sanitized_name = g_strdup(region_name);
1805             for (c = sanitized_name; *c != '\0'; c++) {
1806                 if (*c == '/') {
1807                     *c = '_';
1808                 }
1809             }
1810
1811             filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1812                                        sanitized_name);
1813             g_free(sanitized_name);
1814
1815             fd = mkstemp(filename);
1816             if (fd >= 0) {
1817                 unlink(filename);
1818                 g_free(filename);
1819                 break;
1820             }
1821             g_free(filename);
1822         }
1823         if (errno != EEXIST && errno != EINTR) {
1824             error_setg_errno(errp, errno,
1825                              "can't open backing store %s for guest RAM",
1826                              path);
1827             return -1;
1828         }
1829         /*
1830          * Try again on EINTR and EEXIST.  The latter happens when
1831          * something else creates the file between our two open().
1832          */
1833     }
1834
1835     return fd;
1836 }
1837
1838 static void *file_ram_alloc(RAMBlock *block,
1839                             ram_addr_t memory,
1840                             int fd,
1841                             bool truncate,
1842                             Error **errp)
1843 {
1844     MachineState *ms = MACHINE(qdev_get_machine());
1845     void *area;
1846
1847     block->page_size = qemu_fd_getpagesize(fd);
1848     if (block->mr->align % block->page_size) {
1849         error_setg(errp, "alignment 0x%" PRIx64
1850                    " must be multiples of page size 0x%zx",
1851                    block->mr->align, block->page_size);
1852         return NULL;
1853     } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
1854         error_setg(errp, "alignment 0x%" PRIx64
1855                    " must be a power of two", block->mr->align);
1856         return NULL;
1857     }
1858     block->mr->align = MAX(block->page_size, block->mr->align);
1859 #if defined(__s390x__)
1860     if (kvm_enabled()) {
1861         block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1862     }
1863 #endif
1864
1865     if (memory < block->page_size) {
1866         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1867                    "or larger than page size 0x%zx",
1868                    memory, block->page_size);
1869         return NULL;
1870     }
1871
1872     memory = ROUND_UP(memory, block->page_size);
1873
1874     /*
1875      * ftruncate is not supported by hugetlbfs in older
1876      * hosts, so don't bother bailing out on errors.
1877      * If anything goes wrong with it under other filesystems,
1878      * mmap will fail.
1879      *
1880      * Do not truncate the non-empty backend file to avoid corrupting
1881      * the existing data in the file. Disabling shrinking is not
1882      * enough. For example, the current vNVDIMM implementation stores
1883      * the guest NVDIMM labels at the end of the backend file. If the
1884      * backend file is later extended, QEMU will not be able to find
1885      * those labels. Therefore, extending the non-empty backend file
1886      * is disabled as well.
1887      */
1888     if (truncate && ftruncate(fd, memory)) {
1889         perror("ftruncate");
1890     }
1891
1892     area = qemu_ram_mmap(fd, memory, block->mr->align,
1893                          block->flags & RAM_SHARED, block->flags & RAM_PMEM);
1894     if (area == MAP_FAILED) {
1895         error_setg_errno(errp, errno,
1896                          "unable to map backing store for guest RAM");
1897         return NULL;
1898     }
1899
1900     if (mem_prealloc) {
1901         os_mem_prealloc(fd, area, memory, ms->smp.cpus, errp);
1902         if (errp && *errp) {
1903             qemu_ram_munmap(fd, area, memory);
1904             return NULL;
1905         }
1906     }
1907
1908     block->fd = fd;
1909     return area;
1910 }
1911 #endif
1912
1913 /* Allocate space within the ram_addr_t space that governs the
1914  * dirty bitmaps.
1915  * Called with the ramlist lock held.
1916  */
1917 static ram_addr_t find_ram_offset(ram_addr_t size)
1918 {
1919     RAMBlock *block, *next_block;
1920     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1921
1922     assert(size != 0); /* it would hand out same offset multiple times */
1923
1924     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1925         return 0;
1926     }
1927
1928     RAMBLOCK_FOREACH(block) {
1929         ram_addr_t candidate, next = RAM_ADDR_MAX;
1930
1931         /* Align blocks to start on a 'long' in the bitmap
1932          * which makes the bitmap sync'ing take the fast path.
1933          */
1934         candidate = block->offset + block->max_length;
1935         candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
1936
1937         /* Search for the closest following block
1938          * and find the gap.
1939          */
1940         RAMBLOCK_FOREACH(next_block) {
1941             if (next_block->offset >= candidate) {
1942                 next = MIN(next, next_block->offset);
1943             }
1944         }
1945
1946         /* If it fits remember our place and remember the size
1947          * of gap, but keep going so that we might find a smaller
1948          * gap to fill so avoiding fragmentation.
1949          */
1950         if (next - candidate >= size && next - candidate < mingap) {
1951             offset = candidate;
1952             mingap = next - candidate;
1953         }
1954
1955         trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
1956     }
1957
1958     if (offset == RAM_ADDR_MAX) {
1959         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1960                 (uint64_t)size);
1961         abort();
1962     }
1963
1964     trace_find_ram_offset(size, offset);
1965
1966     return offset;
1967 }
1968
1969 static unsigned long last_ram_page(void)
1970 {
1971     RAMBlock *block;
1972     ram_addr_t last = 0;
1973
1974     RCU_READ_LOCK_GUARD();
1975     RAMBLOCK_FOREACH(block) {
1976         last = MAX(last, block->offset + block->max_length);
1977     }
1978     return last >> TARGET_PAGE_BITS;
1979 }
1980
1981 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1982 {
1983     int ret;
1984
1985     /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
1986     if (!machine_dump_guest_core(current_machine)) {
1987         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1988         if (ret) {
1989             perror("qemu_madvise");
1990             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1991                             "but dump_guest_core=off specified\n");
1992         }
1993     }
1994 }
1995
1996 const char *qemu_ram_get_idstr(RAMBlock *rb)
1997 {
1998     return rb->idstr;
1999 }
2000
2001 void *qemu_ram_get_host_addr(RAMBlock *rb)
2002 {
2003     return rb->host;
2004 }
2005
2006 ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
2007 {
2008     return rb->offset;
2009 }
2010
2011 ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
2012 {
2013     return rb->used_length;
2014 }
2015
2016 bool qemu_ram_is_shared(RAMBlock *rb)
2017 {
2018     return rb->flags & RAM_SHARED;
2019 }
2020
2021 /* Note: Only set at the start of postcopy */
2022 bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
2023 {
2024     return rb->flags & RAM_UF_ZEROPAGE;
2025 }
2026
2027 void qemu_ram_set_uf_zeroable(RAMBlock *rb)
2028 {
2029     rb->flags |= RAM_UF_ZEROPAGE;
2030 }
2031
2032 bool qemu_ram_is_migratable(RAMBlock *rb)
2033 {
2034     return rb->flags & RAM_MIGRATABLE;
2035 }
2036
2037 void qemu_ram_set_migratable(RAMBlock *rb)
2038 {
2039     rb->flags |= RAM_MIGRATABLE;
2040 }
2041
2042 void qemu_ram_unset_migratable(RAMBlock *rb)
2043 {
2044     rb->flags &= ~RAM_MIGRATABLE;
2045 }
2046
2047 /* Called with iothread lock held.  */
2048 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
2049 {
2050     RAMBlock *block;
2051
2052     assert(new_block);
2053     assert(!new_block->idstr[0]);
2054
2055     if (dev) {
2056         char *id = qdev_get_dev_path(dev);
2057         if (id) {
2058             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
2059             g_free(id);
2060         }
2061     }
2062     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
2063
2064     RCU_READ_LOCK_GUARD();
2065     RAMBLOCK_FOREACH(block) {
2066         if (block != new_block &&
2067             !strcmp(block->idstr, new_block->idstr)) {
2068             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
2069                     new_block->idstr);
2070             abort();
2071         }
2072     }
2073 }
2074
2075 /* Called with iothread lock held.  */
2076 void qemu_ram_unset_idstr(RAMBlock *block)
2077 {
2078     /* FIXME: arch_init.c assumes that this is not called throughout
2079      * migration.  Ignore the problem since hot-unplug during migration
2080      * does not work anyway.
2081      */
2082     if (block) {
2083         memset(block->idstr, 0, sizeof(block->idstr));
2084     }
2085 }
2086
2087 size_t qemu_ram_pagesize(RAMBlock *rb)
2088 {
2089     return rb->page_size;
2090 }
2091
2092 /* Returns the largest size of page in use */
2093 size_t qemu_ram_pagesize_largest(void)
2094 {
2095     RAMBlock *block;
2096     size_t largest = 0;
2097
2098     RAMBLOCK_FOREACH(block) {
2099         largest = MAX(largest, qemu_ram_pagesize(block));
2100     }
2101
2102     return largest;
2103 }
2104
2105 static int memory_try_enable_merging(void *addr, size_t len)
2106 {
2107     if (!machine_mem_merge(current_machine)) {
2108         /* disabled by the user */
2109         return 0;
2110     }
2111
2112     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
2113 }
2114
2115 /* Only legal before guest might have detected the memory size: e.g. on
2116  * incoming migration, or right after reset.
2117  *
2118  * As memory core doesn't know how is memory accessed, it is up to
2119  * resize callback to update device state and/or add assertions to detect
2120  * misuse, if necessary.
2121  */
2122 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
2123 {
2124     assert(block);
2125
2126     newsize = HOST_PAGE_ALIGN(newsize);
2127
2128     if (block->used_length == newsize) {
2129         return 0;
2130     }
2131
2132     if (!(block->flags & RAM_RESIZEABLE)) {
2133         error_setg_errno(errp, EINVAL,
2134                          "Length mismatch: %s: 0x" RAM_ADDR_FMT
2135                          " in != 0x" RAM_ADDR_FMT, block->idstr,
2136                          newsize, block->used_length);
2137         return -EINVAL;
2138     }
2139
2140     if (block->max_length < newsize) {
2141         error_setg_errno(errp, EINVAL,
2142                          "Length too large: %s: 0x" RAM_ADDR_FMT
2143                          " > 0x" RAM_ADDR_FMT, block->idstr,
2144                          newsize, block->max_length);
2145         return -EINVAL;
2146     }
2147
2148     cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
2149     block->used_length = newsize;
2150     cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
2151                                         DIRTY_CLIENTS_ALL);
2152     memory_region_set_size(block->mr, newsize);
2153     if (block->resized) {
2154         block->resized(block->idstr, newsize, block->host);
2155     }
2156     return 0;
2157 }
2158
2159 /* Called with ram_list.mutex held */
2160 static void dirty_memory_extend(ram_addr_t old_ram_size,
2161                                 ram_addr_t new_ram_size)
2162 {
2163     ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size,
2164                                              DIRTY_MEMORY_BLOCK_SIZE);
2165     ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size,
2166                                              DIRTY_MEMORY_BLOCK_SIZE);
2167     int i;
2168
2169     /* Only need to extend if block count increased */
2170     if (new_num_blocks <= old_num_blocks) {
2171         return;
2172     }
2173
2174     for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
2175         DirtyMemoryBlocks *old_blocks;
2176         DirtyMemoryBlocks *new_blocks;
2177         int j;
2178
2179         old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]);
2180         new_blocks = g_malloc(sizeof(*new_blocks) +
2181                               sizeof(new_blocks->blocks[0]) * new_num_blocks);
2182
2183         if (old_num_blocks) {
2184             memcpy(new_blocks->blocks, old_blocks->blocks,
2185                    old_num_blocks * sizeof(old_blocks->blocks[0]));
2186         }
2187
2188         for (j = old_num_blocks; j < new_num_blocks; j++) {
2189             new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
2190         }
2191
2192         atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
2193
2194         if (old_blocks) {
2195             g_free_rcu(old_blocks, rcu);
2196         }
2197     }
2198 }
2199
2200 static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
2201 {
2202     RAMBlock *block;
2203     RAMBlock *last_block = NULL;
2204     ram_addr_t old_ram_size, new_ram_size;
2205     Error *err = NULL;
2206
2207     old_ram_size = last_ram_page();
2208
2209     qemu_mutex_lock_ramlist();
2210     new_block->offset = find_ram_offset(new_block->max_length);
2211
2212     if (!new_block->host) {
2213         if (xen_enabled()) {
2214             xen_ram_alloc(new_block->offset, new_block->max_length,
2215                           new_block->mr, &err);
2216             if (err) {
2217                 error_propagate(errp, err);
2218                 qemu_mutex_unlock_ramlist();
2219                 return;
2220             }
2221         } else {
2222             new_block->host = phys_mem_alloc(new_block->max_length,
2223                                              &new_block->mr->align, shared);
2224             if (!new_block->host) {
2225                 error_setg_errno(errp, errno,
2226                                  "cannot set up guest memory '%s'",
2227                                  memory_region_name(new_block->mr));
2228                 qemu_mutex_unlock_ramlist();
2229                 return;
2230             }
2231             memory_try_enable_merging(new_block->host, new_block->max_length);
2232         }
2233     }
2234
2235     new_ram_size = MAX(old_ram_size,
2236               (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS);
2237     if (new_ram_size > old_ram_size) {
2238         dirty_memory_extend(old_ram_size, new_ram_size);
2239     }
2240     /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
2241      * QLIST (which has an RCU-friendly variant) does not have insertion at
2242      * tail, so save the last element in last_block.
2243      */
2244     RAMBLOCK_FOREACH(block) {
2245         last_block = block;
2246         if (block->max_length < new_block->max_length) {
2247             break;
2248         }
2249     }
2250     if (block) {
2251         QLIST_INSERT_BEFORE_RCU(block, new_block, next);
2252     } else if (last_block) {
2253         QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
2254     } else { /* list is empty */
2255         QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
2256     }
2257     ram_list.mru_block = NULL;
2258
2259     /* Write list before version */
2260     smp_wmb();
2261     ram_list.version++;
2262     qemu_mutex_unlock_ramlist();
2263
2264     cpu_physical_memory_set_dirty_range(new_block->offset,
2265                                         new_block->used_length,
2266                                         DIRTY_CLIENTS_ALL);
2267
2268     if (new_block->host) {
2269         qemu_ram_setup_dump(new_block->host, new_block->max_length);
2270         qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
2271         /* MADV_DONTFORK is also needed by KVM in absence of synchronous MMU */
2272         qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_DONTFORK);
2273         ram_block_notify_add(new_block->host, new_block->max_length);
2274     }
2275 }
2276
2277 #ifdef CONFIG_POSIX
2278 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
2279                                  uint32_t ram_flags, int fd,
2280                                  Error **errp)
2281 {
2282     RAMBlock *new_block;
2283     Error *local_err = NULL;
2284     int64_t file_size;
2285
2286     /* Just support these ram flags by now. */
2287     assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
2288
2289     if (xen_enabled()) {
2290         error_setg(errp, "-mem-path not supported with Xen");
2291         return NULL;
2292     }
2293
2294     if (kvm_enabled() && !kvm_has_sync_mmu()) {
2295         error_setg(errp,
2296                    "host lacks kvm mmu notifiers, -mem-path unsupported");
2297         return NULL;
2298     }
2299
2300     if (phys_mem_alloc != qemu_anon_ram_alloc) {
2301         /*
2302          * file_ram_alloc() needs to allocate just like
2303          * phys_mem_alloc, but we haven't bothered to provide
2304          * a hook there.
2305          */
2306         error_setg(errp,
2307                    "-mem-path not supported with this accelerator");
2308         return NULL;
2309     }
2310
2311     size = HOST_PAGE_ALIGN(size);
2312     file_size = get_file_size(fd);
2313     if (file_size > 0 && file_size < size) {
2314         error_setg(errp, "backing store %s size 0x%" PRIx64
2315                    " does not match 'size' option 0x" RAM_ADDR_FMT,
2316                    mem_path, file_size, size);
2317         return NULL;
2318     }
2319
2320     new_block = g_malloc0(sizeof(*new_block));
2321     new_block->mr = mr;
2322     new_block->used_length = size;
2323     new_block->max_length = size;
2324     new_block->flags = ram_flags;
2325     new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp);
2326     if (!new_block->host) {
2327         g_free(new_block);
2328         return NULL;
2329     }
2330
2331     ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED);
2332     if (local_err) {
2333         g_free(new_block);
2334         error_propagate(errp, local_err);
2335         return NULL;
2336     }
2337     return new_block;
2338
2339 }
2340
2341
2342 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2343                                    uint32_t ram_flags, const char *mem_path,
2344                                    Error **errp)
2345 {
2346     int fd;
2347     bool created;
2348     RAMBlock *block;
2349
2350     fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp);
2351     if (fd < 0) {
2352         return NULL;
2353     }
2354
2355     block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp);
2356     if (!block) {
2357         if (created) {
2358             unlink(mem_path);
2359         }
2360         close(fd);
2361         return NULL;
2362     }
2363
2364     return block;
2365 }
2366 #endif
2367
2368 static
2369 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2370                                   void (*resized)(const char*,
2371                                                   uint64_t length,
2372                                                   void *host),
2373                                   void *host, bool resizeable, bool share,
2374                                   MemoryRegion *mr, Error **errp)
2375 {
2376     RAMBlock *new_block;
2377     Error *local_err = NULL;
2378
2379     size = HOST_PAGE_ALIGN(size);
2380     max_size = HOST_PAGE_ALIGN(max_size);
2381     new_block = g_malloc0(sizeof(*new_block));
2382     new_block->mr = mr;
2383     new_block->resized = resized;
2384     new_block->used_length = size;
2385     new_block->max_length = max_size;
2386     assert(max_size >= size);
2387     new_block->fd = -1;
2388     new_block->page_size = qemu_real_host_page_size;
2389     new_block->host = host;
2390     if (host) {
2391         new_block->flags |= RAM_PREALLOC;
2392     }
2393     if (resizeable) {
2394         new_block->flags |= RAM_RESIZEABLE;
2395     }
2396     ram_block_add(new_block, &local_err, share);
2397     if (local_err) {
2398         g_free(new_block);
2399         error_propagate(errp, local_err);
2400         return NULL;
2401     }
2402     return new_block;
2403 }
2404
2405 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2406                                    MemoryRegion *mr, Error **errp)
2407 {
2408     return qemu_ram_alloc_internal(size, size, NULL, host, false,
2409                                    false, mr, errp);
2410 }
2411
2412 RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
2413                          MemoryRegion *mr, Error **errp)
2414 {
2415     return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
2416                                    share, mr, errp);
2417 }
2418
2419 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2420                                      void (*resized)(const char*,
2421                                                      uint64_t length,
2422                                                      void *host),
2423                                      MemoryRegion *mr, Error **errp)
2424 {
2425     return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
2426                                    false, mr, errp);
2427 }
2428
2429 static void reclaim_ramblock(RAMBlock *block)
2430 {
2431     if (block->flags & RAM_PREALLOC) {
2432         ;
2433     } else if (xen_enabled()) {
2434         xen_invalidate_map_cache_entry(block->host);
2435 #ifndef _WIN32
2436     } else if (block->fd >= 0) {
2437         qemu_ram_munmap(block->fd, block->host, block->max_length);
2438         close(block->fd);
2439 #endif
2440     } else {
2441         qemu_anon_ram_free(block->host, block->max_length);
2442     }
2443     g_free(block);
2444 }
2445
2446 void qemu_ram_free(RAMBlock *block)
2447 {
2448     if (!block) {
2449         return;
2450     }
2451
2452     if (block->host) {
2453         ram_block_notify_remove(block->host, block->max_length);
2454     }
2455
2456     qemu_mutex_lock_ramlist();
2457     QLIST_REMOVE_RCU(block, next);
2458     ram_list.mru_block = NULL;
2459     /* Write list before version */
2460     smp_wmb();
2461     ram_list.version++;
2462     call_rcu(block, reclaim_ramblock, rcu);
2463     qemu_mutex_unlock_ramlist();
2464 }
2465
2466 #ifndef _WIN32
2467 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
2468 {
2469     RAMBlock *block;
2470     ram_addr_t offset;
2471     int flags;
2472     void *area, *vaddr;
2473
2474     RAMBLOCK_FOREACH(block) {
2475         offset = addr - block->offset;
2476         if (offset < block->max_length) {
2477             vaddr = ramblock_ptr(block, offset);
2478             if (block->flags & RAM_PREALLOC) {
2479                 ;
2480             } else if (xen_enabled()) {
2481                 abort();
2482             } else {
2483                 flags = MAP_FIXED;
2484                 if (block->fd >= 0) {
2485                     flags |= (block->flags & RAM_SHARED ?
2486                               MAP_SHARED : MAP_PRIVATE);
2487                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2488                                 flags, block->fd, offset);
2489                 } else {
2490                     /*
2491                      * Remap needs to match alloc.  Accelerators that
2492                      * set phys_mem_alloc never remap.  If they did,
2493                      * we'd need a remap hook here.
2494                      */
2495                     assert(phys_mem_alloc == qemu_anon_ram_alloc);
2496
2497                     flags |= MAP_PRIVATE | MAP_ANONYMOUS;
2498                     area = mmap(vaddr, length, PROT_READ | PROT_WRITE,
2499                                 flags, -1, 0);
2500                 }
2501                 if (area != vaddr) {
2502                     error_report("Could not remap addr: "
2503                                  RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
2504                                  length, addr);
2505                     exit(1);
2506                 }
2507                 memory_try_enable_merging(vaddr, length);
2508                 qemu_ram_setup_dump(vaddr, length);
2509             }
2510         }
2511     }
2512 }
2513 #endif /* !_WIN32 */
2514
2515 /* Return a host pointer to ram allocated with qemu_ram_alloc.
2516  * This should not be used for general purpose DMA.  Use address_space_map
2517  * or address_space_rw instead. For local memory (e.g. video ram) that the
2518  * device owns, use memory_region_get_ram_ptr.
2519  *
2520  * Called within RCU critical section.
2521  */
2522 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2523 {
2524     RAMBlock *block = ram_block;
2525
2526     if (block == NULL) {
2527         block = qemu_get_ram_block(addr);
2528         addr -= block->offset;
2529     }
2530
2531     if (xen_enabled() && block->host == NULL) {
2532         /* We need to check if the requested address is in the RAM
2533          * because we don't want to map the entire memory in QEMU.
2534          * In that case just map until the end of the page.
2535          */
2536         if (block->offset == 0) {
2537             return xen_map_cache(addr, 0, 0, false);
2538         }
2539
2540         block->host = xen_map_cache(block->offset, block->max_length, 1, false);
2541     }
2542     return ramblock_ptr(block, addr);
2543 }
2544
2545 /* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr
2546  * but takes a size argument.
2547  *
2548  * Called within RCU critical section.
2549  */
2550 static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
2551                                  hwaddr *size, bool lock)
2552 {
2553     RAMBlock *block = ram_block;
2554     if (*size == 0) {
2555         return NULL;
2556     }
2557
2558     if (block == NULL) {
2559         block = qemu_get_ram_block(addr);
2560         addr -= block->offset;
2561     }
2562     *size = MIN(*size, block->max_length - addr);
2563
2564     if (xen_enabled() && block->host == NULL) {
2565         /* We need to check if the requested address is in the RAM
2566          * because we don't want to map the entire memory in QEMU.
2567          * In that case just map the requested area.
2568          */
2569         if (block->offset == 0) {
2570             return xen_map_cache(addr, *size, lock, lock);
2571         }
2572
2573         block->host = xen_map_cache(block->offset, block->max_length, 1, lock);
2574     }
2575
2576     return ramblock_ptr(block, addr);
2577 }
2578
2579 /* Return the offset of a hostpointer within a ramblock */
2580 ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
2581 {
2582     ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
2583     assert((uintptr_t)host >= (uintptr_t)rb->host);
2584     assert(res < rb->max_length);
2585
2586     return res;
2587 }
2588
2589 /*
2590  * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
2591  * in that RAMBlock.
2592  *
2593  * ptr: Host pointer to look up
2594  * round_offset: If true round the result offset down to a page boundary
2595  * *ram_addr: set to result ram_addr
2596  * *offset: set to result offset within the RAMBlock
2597  *
2598  * Returns: RAMBlock (or NULL if not found)
2599  *
2600  * By the time this function returns, the returned pointer is not protected
2601  * by RCU anymore.  If the caller is not within an RCU critical section and
2602  * does not hold the iothread lock, it must have other means of protecting the
2603  * pointer, such as a reference to the region that includes the incoming
2604  * ram_addr_t.
2605  */
2606 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2607                                    ram_addr_t *offset)
2608 {
2609     RAMBlock *block;
2610     uint8_t *host = ptr;
2611
2612     if (xen_enabled()) {
2613         ram_addr_t ram_addr;
2614         RCU_READ_LOCK_GUARD();
2615         ram_addr = xen_ram_addr_from_mapcache(ptr);
2616         block = qemu_get_ram_block(ram_addr);
2617         if (block) {
2618             *offset = ram_addr - block->offset;
2619         }
2620         return block;
2621     }
2622
2623     RCU_READ_LOCK_GUARD();
2624     block = atomic_rcu_read(&ram_list.mru_block);
2625     if (block && block->host && host - block->host < block->max_length) {
2626         goto found;
2627     }
2628
2629     RAMBLOCK_FOREACH(block) {
2630         /* This case append when the block is not mapped. */
2631         if (block->host == NULL) {
2632             continue;
2633         }
2634         if (host - block->host < block->max_length) {
2635             goto found;
2636         }
2637     }
2638
2639     return NULL;
2640
2641 found:
2642     *offset = (host - block->host);
2643     if (round_offset) {
2644         *offset &= TARGET_PAGE_MASK;
2645     }
2646     return block;
2647 }
2648
2649 /*
2650  * Finds the named RAMBlock
2651  *
2652  * name: The name of RAMBlock to find
2653  *
2654  * Returns: RAMBlock (or NULL if not found)
2655  */
2656 RAMBlock *qemu_ram_block_by_name(const char *name)
2657 {
2658     RAMBlock *block;
2659
2660     RAMBLOCK_FOREACH(block) {
2661         if (!strcmp(name, block->idstr)) {
2662             return block;
2663         }
2664     }
2665
2666     return NULL;
2667 }
2668
2669 /* Some of the softmmu routines need to translate from a host pointer
2670    (typically a TLB entry) back to a ram offset.  */
2671 ram_addr_t qemu_ram_addr_from_host(void *ptr)
2672 {
2673     RAMBlock *block;
2674     ram_addr_t offset;
2675
2676     block = qemu_ram_block_from_host(ptr, false, &offset);
2677     if (!block) {
2678         return RAM_ADDR_INVALID;
2679     }
2680
2681     return block->offset + offset;
2682 }
2683
2684 /* Generate a debug exception if a watchpoint has been hit.  */
2685 void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
2686                           MemTxAttrs attrs, int flags, uintptr_t ra)
2687 {
2688     CPUClass *cc = CPU_GET_CLASS(cpu);
2689     CPUWatchpoint *wp;
2690
2691     assert(tcg_enabled());
2692     if (cpu->watchpoint_hit) {
2693         /*
2694          * We re-entered the check after replacing the TB.
2695          * Now raise the debug interrupt so that it will
2696          * trigger after the current instruction.
2697          */
2698         qemu_mutex_lock_iothread();
2699         cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
2700         qemu_mutex_unlock_iothread();
2701         return;
2702     }
2703
2704     addr = cc->adjust_watchpoint_address(cpu, addr, len);
2705     QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
2706         if (watchpoint_address_matches(wp, addr, len)
2707             && (wp->flags & flags)) {
2708             if (flags == BP_MEM_READ) {
2709                 wp->flags |= BP_WATCHPOINT_HIT_READ;
2710             } else {
2711                 wp->flags |= BP_WATCHPOINT_HIT_WRITE;
2712             }
2713             wp->hitaddr = MAX(addr, wp->vaddr);
2714             wp->hitattrs = attrs;
2715             if (!cpu->watchpoint_hit) {
2716                 if (wp->flags & BP_CPU &&
2717                     !cc->debug_check_watchpoint(cpu, wp)) {
2718                     wp->flags &= ~BP_WATCHPOINT_HIT;
2719                     continue;
2720                 }
2721                 cpu->watchpoint_hit = wp;
2722
2723                 mmap_lock();
2724                 tb_check_watchpoint(cpu, ra);
2725                 if (wp->flags & BP_STOP_BEFORE_ACCESS) {
2726                     cpu->exception_index = EXCP_DEBUG;
2727                     mmap_unlock();
2728                     cpu_loop_exit_restore(cpu, ra);
2729                 } else {
2730                     /* Force execution of one insn next time.  */
2731                     cpu->cflags_next_tb = 1 | curr_cflags();
2732                     mmap_unlock();
2733                     if (ra) {
2734                         cpu_restore_state(cpu, ra, true);
2735                     }
2736                     cpu_loop_exit_noexc(cpu);
2737                 }
2738             }
2739         } else {
2740             wp->flags &= ~BP_WATCHPOINT_HIT;
2741         }
2742     }
2743 }
2744
2745 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
2746                                  MemTxAttrs attrs, uint8_t *buf, hwaddr len);
2747 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2748                                   const uint8_t *buf, hwaddr len);
2749 static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
2750                                   bool is_write, MemTxAttrs attrs);
2751
2752 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2753                                 unsigned len, MemTxAttrs attrs)
2754 {
2755     subpage_t *subpage = opaque;
2756     uint8_t buf[8];
2757     MemTxResult res;
2758
2759 #if defined(DEBUG_SUBPAGE)
2760     printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__,
2761            subpage, len, addr);
2762 #endif
2763     res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2764     if (res) {
2765         return res;
2766     }
2767     *data = ldn_p(buf, len);
2768     return MEMTX_OK;
2769 }
2770
2771 static MemTxResult subpage_write(void *opaque, hwaddr addr,
2772                                  uint64_t value, unsigned len, MemTxAttrs attrs)
2773 {
2774     subpage_t *subpage = opaque;
2775     uint8_t buf[8];
2776
2777 #if defined(DEBUG_SUBPAGE)
2778     printf("%s: subpage %p len %u addr " TARGET_FMT_plx
2779            " value %"PRIx64"\n",
2780            __func__, subpage, len, addr, value);
2781 #endif
2782     stn_p(buf, len, value);
2783     return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2784 }
2785
2786 static bool subpage_accepts(void *opaque, hwaddr addr,
2787                             unsigned len, bool is_write,
2788                             MemTxAttrs attrs)
2789 {
2790     subpage_t *subpage = opaque;
2791 #if defined(DEBUG_SUBPAGE)
2792     printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n",
2793            __func__, subpage, is_write ? 'w' : 'r', len, addr);
2794 #endif
2795
2796     return flatview_access_valid(subpage->fv, addr + subpage->base,
2797                                  len, is_write, attrs);
2798 }
2799
2800 static const MemoryRegionOps subpage_ops = {
2801     .read_with_attrs = subpage_read,
2802     .write_with_attrs = subpage_write,
2803     .impl.min_access_size = 1,
2804     .impl.max_access_size = 8,
2805     .valid.min_access_size = 1,
2806     .valid.max_access_size = 8,
2807     .valid.accepts = subpage_accepts,
2808     .endianness = DEVICE_NATIVE_ENDIAN,
2809 };
2810
2811 static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
2812                             uint16_t section)
2813 {
2814     int idx, eidx;
2815
2816     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2817         return -1;
2818     idx = SUBPAGE_IDX(start);
2819     eidx = SUBPAGE_IDX(end);
2820 #if defined(DEBUG_SUBPAGE)
2821     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2822            __func__, mmio, start, end, idx, eidx, section);
2823 #endif
2824     for (; idx <= eidx; idx++) {
2825         mmio->sub_section[idx] = section;
2826     }
2827
2828     return 0;
2829 }
2830
2831 static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2832 {
2833     subpage_t *mmio;
2834
2835     /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
2836     mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2837     mmio->fv = fv;
2838     mmio->base = base;
2839     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2840                           NULL, TARGET_PAGE_SIZE);
2841     mmio->iomem.subpage = true;
2842 #if defined(DEBUG_SUBPAGE)
2843     printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__,
2844            mmio, base, TARGET_PAGE_SIZE);
2845 #endif
2846
2847     return mmio;
2848 }
2849
2850 static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2851 {
2852     assert(fv);
2853     MemoryRegionSection section = {
2854         .fv = fv,
2855         .mr = mr,
2856         .offset_within_address_space = 0,
2857         .offset_within_region = 0,
2858         .size = int128_2_64(),
2859     };
2860
2861     return phys_section_add(map, &section);
2862 }
2863
2864 MemoryRegionSection *iotlb_to_section(CPUState *cpu,
2865                                       hwaddr index, MemTxAttrs attrs)
2866 {
2867     int asidx = cpu_asidx_from_attrs(cpu, attrs);
2868     CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2869     AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch);
2870     MemoryRegionSection *sections = d->map.sections;
2871
2872     return &sections[index & ~TARGET_PAGE_MASK];
2873 }
2874
2875 static void io_mem_init(void)
2876 {
2877     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2878                           NULL, UINT64_MAX);
2879 }
2880
2881 AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
2882 {
2883     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2884     uint16_t n;
2885
2886     n = dummy_section(&d->map, fv, &io_mem_unassigned);
2887     assert(n == PHYS_SECTION_UNASSIGNED);
2888
2889     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2890
2891     return d;
2892 }
2893
2894 void address_space_dispatch_free(AddressSpaceDispatch *d)
2895 {
2896     phys_sections_free(&d->map);
2897     g_free(d);
2898 }
2899
2900 static void do_nothing(CPUState *cpu, run_on_cpu_data d)
2901 {
2902 }
2903
2904 static void tcg_log_global_after_sync(MemoryListener *listener)
2905 {
2906     CPUAddressSpace *cpuas;
2907
2908     /* Wait for the CPU to end the current TB.  This avoids the following
2909      * incorrect race:
2910      *
2911      *      vCPU                         migration
2912      *      ----------------------       -------------------------
2913      *      TLB check -> slow path
2914      *        notdirty_mem_write
2915      *          write to RAM
2916      *          mark dirty
2917      *                                   clear dirty flag
2918      *      TLB check -> fast path
2919      *                                   read memory
2920      *        write to RAM
2921      *
2922      * by pushing the migration thread's memory read after the vCPU thread has
2923      * written the memory.
2924      */
2925     if (replay_mode == REPLAY_MODE_NONE) {
2926         /*
2927          * VGA can make calls to this function while updating the screen.
2928          * In record/replay mode this causes a deadlock, because
2929          * run_on_cpu waits for rr mutex. Therefore no races are possible
2930          * in this case and no need for making run_on_cpu when
2931          * record/replay is not enabled.
2932          */
2933         cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2934         run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
2935     }
2936 }
2937
2938 static void tcg_commit(MemoryListener *listener)
2939 {
2940     CPUAddressSpace *cpuas;
2941     AddressSpaceDispatch *d;
2942
2943     assert(tcg_enabled());
2944     /* since each CPU stores ram addresses in its TLB cache, we must
2945        reset the modified entries */
2946     cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2947     cpu_reloading_memory_map();
2948     /* The CPU and TLB are protected by the iothread lock.
2949      * We reload the dispatch pointer now because cpu_reloading_memory_map()
2950      * may have split the RCU critical section.
2951      */
2952     d = address_space_to_dispatch(cpuas->as);
2953     atomic_rcu_set(&cpuas->memory_dispatch, d);
2954     tlb_flush(cpuas->cpu);
2955 }
2956
2957 static void memory_map_init(void)
2958 {
2959     system_memory = g_malloc(sizeof(*system_memory));
2960
2961     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2962     address_space_init(&address_space_memory, system_memory, "memory");
2963
2964     system_io = g_malloc(sizeof(*system_io));
2965     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2966                           65536);
2967     address_space_init(&address_space_io, system_io, "I/O");
2968 }
2969
2970 MemoryRegion *get_system_memory(void)
2971 {
2972     return system_memory;
2973 }
2974
2975 MemoryRegion *get_system_io(void)
2976 {
2977     return system_io;
2978 }
2979
2980 #endif /* !defined(CONFIG_USER_ONLY) */
2981
2982 /* physical memory access (slow version, mainly for debug) */
2983 #if defined(CONFIG_USER_ONLY)
2984 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2985                         uint8_t *buf, target_ulong len, int is_write)
2986 {
2987     int flags;
2988     target_ulong l, page;
2989     void * p;
2990
2991     while (len > 0) {
2992         page = addr & TARGET_PAGE_MASK;
2993         l = (page + TARGET_PAGE_SIZE) - addr;
2994         if (l > len)
2995             l = len;
2996         flags = page_get_flags(page);
2997         if (!(flags & PAGE_VALID))
2998             return -1;
2999         if (is_write) {
3000             if (!(flags & PAGE_WRITE))
3001                 return -1;
3002             /* XXX: this code should not depend on lock_user */
3003             if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
3004                 return -1;
3005             memcpy(p, buf, l);
3006             unlock_user(p, addr, l);
3007         } else {
3008             if (!(flags & PAGE_READ))
3009                 return -1;
3010             /* XXX: this code should not depend on lock_user */
3011             if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
3012                 return -1;
3013             memcpy(buf, p, l);
3014             unlock_user(p, addr, 0);
3015         }
3016         len -= l;
3017         buf += l;
3018         addr += l;
3019     }
3020     return 0;
3021 }
3022
3023 #else
3024
3025 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
3026                                      hwaddr length)
3027 {
3028     uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
3029     addr += memory_region_get_ram_addr(mr);
3030
3031     /* No early return if dirty_log_mask is or becomes 0, because
3032      * cpu_physical_memory_set_dirty_range will still call
3033      * xen_modified_memory.
3034      */
3035     if (dirty_log_mask) {
3036         dirty_log_mask =
3037             cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
3038     }
3039     if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
3040         assert(tcg_enabled());
3041         tb_invalidate_phys_range(addr, addr + length);
3042         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
3043     }
3044     cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
3045 }
3046
3047 void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
3048 {
3049     /*
3050      * In principle this function would work on other memory region types too,
3051      * but the ROM device use case is the only one where this operation is
3052      * necessary.  Other memory regions should use the
3053      * address_space_read/write() APIs.
3054      */
3055     assert(memory_region_is_romd(mr));
3056
3057     invalidate_and_set_dirty(mr, addr, size);
3058 }
3059
3060 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
3061 {
3062     unsigned access_size_max = mr->ops->valid.max_access_size;
3063
3064     /* Regions are assumed to support 1-4 byte accesses unless
3065        otherwise specified.  */
3066     if (access_size_max == 0) {
3067         access_size_max = 4;
3068     }
3069
3070     /* Bound the maximum access by the alignment of the address.  */
3071     if (!mr->ops->impl.unaligned) {
3072         unsigned align_size_max = addr & -addr;
3073         if (align_size_max != 0 && align_size_max < access_size_max) {
3074             access_size_max = align_size_max;
3075         }
3076     }
3077
3078     /* Don't attempt accesses larger than the maximum.  */
3079     if (l > access_size_max) {
3080         l = access_size_max;
3081     }
3082     l = pow2floor(l);
3083
3084     return l;
3085 }
3086
3087 static bool prepare_mmio_access(MemoryRegion *mr)
3088 {
3089     bool unlocked = !qemu_mutex_iothread_locked();
3090     bool release_lock = false;
3091
3092     if (unlocked && mr->global_locking) {
3093         qemu_mutex_lock_iothread();
3094         unlocked = false;
3095         release_lock = true;
3096     }
3097     if (mr->flush_coalesced_mmio) {
3098         if (unlocked) {
3099             qemu_mutex_lock_iothread();
3100         }
3101         qemu_flush_coalesced_mmio_buffer();
3102         if (unlocked) {
3103             qemu_mutex_unlock_iothread();
3104         }
3105     }
3106
3107     return release_lock;
3108 }
3109
3110 /* Called within RCU critical section.  */
3111 static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
3112                                            MemTxAttrs attrs,
3113                                            const uint8_t *buf,
3114                                            hwaddr len, hwaddr addr1,
3115                                            hwaddr l, MemoryRegion *mr)
3116 {
3117     uint8_t *ptr;
3118     uint64_t val;
3119     MemTxResult result = MEMTX_OK;
3120     bool release_lock = false;
3121
3122     for (;;) {
3123         if (!memory_access_is_direct(mr, true)) {
3124             release_lock |= prepare_mmio_access(mr);
3125             l = memory_access_size(mr, l, addr1);
3126             /* XXX: could force current_cpu to NULL to avoid
3127                potential bugs */
3128             val = ldn_he_p(buf, l);
3129             result |= memory_region_dispatch_write(mr, addr1, val,
3130                                                    size_memop(l), attrs);
3131         } else {
3132             /* RAM case */
3133             ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3134             memcpy(ptr, buf, l);
3135             invalidate_and_set_dirty(mr, addr1, l);
3136         }
3137
3138         if (release_lock) {
3139             qemu_mutex_unlock_iothread();
3140             release_lock = false;
3141         }
3142
3143         len -= l;
3144         buf += l;
3145         addr += l;
3146
3147         if (!len) {
3148             break;
3149         }
3150
3151         l = len;
3152         mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3153     }
3154
3155     return result;
3156 }
3157
3158 /* Called from RCU critical section.  */
3159 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3160                                   const uint8_t *buf, hwaddr len)
3161 {
3162     hwaddr l;
3163     hwaddr addr1;
3164     MemoryRegion *mr;
3165     MemTxResult result = MEMTX_OK;
3166
3167     l = len;
3168     mr = flatview_translate(fv, addr, &addr1, &l, true, attrs);
3169     result = flatview_write_continue(fv, addr, attrs, buf, len,
3170                                      addr1, l, mr);
3171
3172     return result;
3173 }
3174
3175 /* Called within RCU critical section.  */
3176 MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
3177                                    MemTxAttrs attrs, uint8_t *buf,
3178                                    hwaddr len, hwaddr addr1, hwaddr l,
3179                                    MemoryRegion *mr)
3180 {
3181     uint8_t *ptr;
3182     uint64_t val;
3183     MemTxResult result = MEMTX_OK;
3184     bool release_lock = false;
3185
3186     for (;;) {
3187         if (!memory_access_is_direct(mr, false)) {
3188             /* I/O case */
3189             release_lock |= prepare_mmio_access(mr);
3190             l = memory_access_size(mr, l, addr1);
3191             result |= memory_region_dispatch_read(mr, addr1, &val,
3192                                                   size_memop(l), attrs);
3193             stn_he_p(buf, l, val);
3194         } else {
3195             /* RAM case */
3196             ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
3197             memcpy(buf, ptr, l);
3198         }
3199
3200         if (release_lock) {
3201             qemu_mutex_unlock_iothread();
3202             release_lock = false;
3203         }
3204
3205         len -= l;
3206         buf += l;
3207         addr += l;
3208
3209         if (!len) {
3210             break;
3211         }
3212
3213         l = len;
3214         mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3215     }
3216
3217     return result;
3218 }
3219
3220 /* Called from RCU critical section.  */
3221 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
3222                                  MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3223 {
3224     hwaddr l;
3225     hwaddr addr1;
3226     MemoryRegion *mr;
3227
3228     l = len;
3229     mr = flatview_translate(fv, addr, &addr1, &l, false, attrs);
3230     return flatview_read_continue(fv, addr, attrs, buf, len,
3231                                   addr1, l, mr);
3232 }
3233
3234 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3235                                     MemTxAttrs attrs, uint8_t *buf, hwaddr len)
3236 {
3237     MemTxResult result = MEMTX_OK;
3238     FlatView *fv;
3239
3240     if (len > 0) {
3241         RCU_READ_LOCK_GUARD();
3242         fv = address_space_to_flatview(as);
3243         result = flatview_read(fv, addr, attrs, buf, len);
3244     }
3245
3246     return result;
3247 }
3248
3249 MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3250                                 MemTxAttrs attrs,
3251                                 const uint8_t *buf, hwaddr len)
3252 {
3253     MemTxResult result = MEMTX_OK;
3254     FlatView *fv;
3255
3256     if (len > 0) {
3257         RCU_READ_LOCK_GUARD();
3258         fv = address_space_to_flatview(as);
3259         result = flatview_write(fv, addr, attrs, buf, len);
3260     }
3261
3262     return result;
3263 }
3264
3265 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3266                              uint8_t *buf, hwaddr len, bool is_write)
3267 {
3268     if (is_write) {
3269         return address_space_write(as, addr, attrs, buf, len);
3270     } else {
3271         return address_space_read_full(as, addr, attrs, buf, len);
3272     }
3273 }
3274
3275 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
3276                             hwaddr len, int is_write)
3277 {
3278     address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3279                      buf, len, is_write);
3280 }
3281
/* Operation selector for address_space_write_rom_internal(). */
enum write_rom_type {
    WRITE_DATA,     /* copy the caller's buffer into the RAM/ROM backing */
    FLUSH_CACHE,    /* flush the host icache over the backing range */
};
3286
3287 static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
3288                                                            hwaddr addr,
3289                                                            MemTxAttrs attrs,
3290                                                            const uint8_t *buf,
3291                                                            hwaddr len,
3292                                                            enum write_rom_type type)
3293 {
3294     hwaddr l;
3295     uint8_t *ptr;
3296     hwaddr addr1;
3297     MemoryRegion *mr;
3298
3299     RCU_READ_LOCK_GUARD();
3300     while (len > 0) {
3301         l = len;
3302         mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
3303
3304         if (!(memory_region_is_ram(mr) ||
3305               memory_region_is_romd(mr))) {
3306             l = memory_access_size(mr, l, addr1);
3307         } else {
3308             /* ROM/RAM case */
3309             ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3310             switch (type) {
3311             case WRITE_DATA:
3312                 memcpy(ptr, buf, l);
3313                 invalidate_and_set_dirty(mr, addr1, l);
3314                 break;
3315             case FLUSH_CACHE:
3316                 flush_icache_range((uintptr_t)ptr, (uintptr_t)ptr + l);
3317                 break;
3318             }
3319         }
3320         len -= l;
3321         buf += l;
3322         addr += l;
3323     }
3324     return MEMTX_OK;
3325 }
3326
3327 /* used for ROM loading : can write in RAM and ROM */
3328 MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
3329                                     MemTxAttrs attrs,
3330                                     const uint8_t *buf, hwaddr len)
3331 {
3332     return address_space_write_rom_internal(as, addr, attrs,
3333                                             buf, len, WRITE_DATA);
3334 }
3335
3336 void cpu_flush_icache_range(hwaddr start, hwaddr len)
3337 {
3338     /*
3339      * This function should do the same thing as an icache flush that was
3340      * triggered from within the guest. For TCG we are always cache coherent,
3341      * so there is no need to flush anything. For KVM / Xen we need to flush
3342      * the host's instruction cache at least.
3343      */
3344     if (tcg_enabled()) {
3345         return;
3346     }
3347
3348     address_space_write_rom_internal(&address_space_memory,
3349                                      start, MEMTXATTRS_UNSPECIFIED,
3350                                      NULL, len, FLUSH_CACHE);
3351 }
3352
3353 typedef struct {
3354     MemoryRegion *mr;
3355     void *buffer;
3356     hwaddr addr;
3357     hwaddr len;
3358     bool in_use;
3359 } BounceBuffer;
3360
3361 static BounceBuffer bounce;
3362
3363 typedef struct MapClient {
3364     QEMUBH *bh;
3365     QLIST_ENTRY(MapClient) link;
3366 } MapClient;
3367
3368 QemuMutex map_client_list_lock;
3369 static QLIST_HEAD(, MapClient) map_client_list
3370     = QLIST_HEAD_INITIALIZER(map_client_list);
3371
3372 static void cpu_unregister_map_client_do(MapClient *client)
3373 {
3374     QLIST_REMOVE(client, link);
3375     g_free(client);
3376 }
3377
3378 static void cpu_notify_map_clients_locked(void)
3379 {
3380     MapClient *client;
3381
3382     while (!QLIST_EMPTY(&map_client_list)) {
3383         client = QLIST_FIRST(&map_client_list);
3384         qemu_bh_schedule(client->bh);
3385         cpu_unregister_map_client_do(client);
3386     }
3387 }
3388
3389 void cpu_register_map_client(QEMUBH *bh)
3390 {
3391     MapClient *client = g_malloc(sizeof(*client));
3392
3393     qemu_mutex_lock(&map_client_list_lock);
3394     client->bh = bh;
3395     QLIST_INSERT_HEAD(&map_client_list, client, link);
3396     if (!atomic_read(&bounce.in_use)) {
3397         cpu_notify_map_clients_locked();
3398     }
3399     qemu_mutex_unlock(&map_client_list_lock);
3400 }
3401
3402 void cpu_exec_init_all(void)
3403 {
3404     qemu_mutex_init(&ram_list.mutex);
3405     /* The data structures we set up here depend on knowing the page size,
3406      * so no more changes can be made after this point.
3407      * In an ideal world, nothing we did before we had finished the
3408      * machine setup would care about the target page size, and we could
3409      * do this much later, rather than requiring board models to state
3410      * up front what their requirements are.
3411      */
3412     finalize_target_page_bits();
3413     io_mem_init();
3414     memory_map_init();
3415     qemu_mutex_init(&map_client_list_lock);
3416 }
3417
3418 void cpu_unregister_map_client(QEMUBH *bh)
3419 {
3420     MapClient *client;
3421
3422     qemu_mutex_lock(&map_client_list_lock);
3423     QLIST_FOREACH(client, &map_client_list, link) {
3424         if (client->bh == bh) {
3425             cpu_unregister_map_client_do(client);
3426             break;
3427         }
3428     }
3429     qemu_mutex_unlock(&map_client_list_lock);
3430 }
3431
3432 static void cpu_notify_map_clients(void)
3433 {
3434     qemu_mutex_lock(&map_client_list_lock);
3435     cpu_notify_map_clients_locked();
3436     qemu_mutex_unlock(&map_client_list_lock);
3437 }
3438
3439 static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
3440                                   bool is_write, MemTxAttrs attrs)
3441 {
3442     MemoryRegion *mr;
3443     hwaddr l, xlat;
3444
3445     while (len > 0) {
3446         l = len;
3447         mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3448         if (!memory_access_is_direct(mr, is_write)) {
3449             l = memory_access_size(mr, l, addr);
3450             if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
3451                 return false;
3452             }
3453         }
3454
3455         len -= l;
3456         addr += l;
3457     }
3458     return true;
3459 }
3460
3461 bool address_space_access_valid(AddressSpace *as, hwaddr addr,
3462                                 hwaddr len, bool is_write,
3463                                 MemTxAttrs attrs)
3464 {
3465     FlatView *fv;
3466     bool result;
3467
3468     RCU_READ_LOCK_GUARD();
3469     fv = address_space_to_flatview(as);
3470     result = flatview_access_valid(fv, addr, len, is_write, attrs);
3471     return result;
3472 }
3473
3474 static hwaddr
3475 flatview_extend_translation(FlatView *fv, hwaddr addr,
3476                             hwaddr target_len,
3477                             MemoryRegion *mr, hwaddr base, hwaddr len,
3478                             bool is_write, MemTxAttrs attrs)
3479 {
3480     hwaddr done = 0;
3481     hwaddr xlat;
3482     MemoryRegion *this_mr;
3483
3484     for (;;) {
3485         target_len -= len;
3486         addr += len;
3487         done += len;
3488         if (target_len == 0) {
3489             return done;
3490         }
3491
3492         len = target_len;
3493         this_mr = flatview_translate(fv, addr, &xlat,
3494                                      &len, is_write, attrs);
3495         if (this_mr != mr || xlat != base + done) {
3496             return done;
3497         }
3498     }
3499 }
3500
3501 /* Map a physical memory region into a host virtual address.
3502  * May map a subset of the requested range, given by and returned in *plen.
3503  * May return NULL if resources needed to perform the mapping are exhausted.
3504  * Use only for reads OR writes - not for read-modify-write operations.
3505  * Use cpu_register_map_client() to know when retrying the map operation is
3506  * likely to succeed.
3507  */
3508 void *address_space_map(AddressSpace *as,
3509                         hwaddr addr,
3510                         hwaddr *plen,
3511                         bool is_write,
3512                         MemTxAttrs attrs)
3513 {
3514     hwaddr len = *plen;
3515     hwaddr l, xlat;
3516     MemoryRegion *mr;
3517     void *ptr;
3518     FlatView *fv;
3519
3520     if (len == 0) {
3521         return NULL;
3522     }
3523
3524     l = len;
3525     RCU_READ_LOCK_GUARD();
3526     fv = address_space_to_flatview(as);
3527     mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3528
3529     if (!memory_access_is_direct(mr, is_write)) {
3530         if (atomic_xchg(&bounce.in_use, true)) {
3531             return NULL;
3532         }
3533         /* Avoid unbounded allocations */
3534         l = MIN(l, TARGET_PAGE_SIZE);
3535         bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
3536         bounce.addr = addr;
3537         bounce.len = l;
3538
3539         memory_region_ref(mr);
3540         bounce.mr = mr;
3541         if (!is_write) {
3542             flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
3543                                bounce.buffer, l);
3544         }
3545
3546         *plen = l;
3547         return bounce.buffer;
3548     }
3549
3550
3551     memory_region_ref(mr);
3552     *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
3553                                         l, is_write, attrs);
3554     ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
3555
3556     return ptr;
3557 }
3558
3559 /* Unmaps a memory region previously mapped by address_space_map().
3560  * Will also mark the memory as dirty if is_write == 1.  access_len gives
3561  * the amount of memory that was actually read or written by the caller.
3562  */
3563 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3564                          int is_write, hwaddr access_len)
3565 {
3566     if (buffer != bounce.buffer) {
3567         MemoryRegion *mr;
3568         ram_addr_t addr1;
3569
3570         mr = memory_region_from_host(buffer, &addr1);
3571         assert(mr != NULL);
3572         if (is_write) {
3573             invalidate_and_set_dirty(mr, addr1, access_len);
3574         }
3575         if (xen_enabled()) {
3576             xen_invalidate_map_cache_entry(buffer);
3577         }
3578         memory_region_unref(mr);
3579         return;
3580     }
3581     if (is_write) {
3582         address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED,
3583                             bounce.buffer, access_len);
3584     }
3585     qemu_vfree(bounce.buffer);
3586     bounce.buffer = NULL;
3587     memory_region_unref(bounce.mr);
3588     atomic_mb_set(&bounce.in_use, false);
3589     cpu_notify_map_clients();
3590 }
3591
3592 void *cpu_physical_memory_map(hwaddr addr,
3593                               hwaddr *plen,
3594                               int is_write)
3595 {
3596     return address_space_map(&address_space_memory, addr, plen, is_write,
3597                              MEMTXATTRS_UNSPECIFIED);
3598 }
3599
3600 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3601                                int is_write, hwaddr access_len)
3602 {
3603     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3604 }
3605
3606 #define ARG1_DECL                AddressSpace *as
3607 #define ARG1                     as
3608 #define SUFFIX
3609 #define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
3610 #define RCU_READ_LOCK(...)       rcu_read_lock()
3611 #define RCU_READ_UNLOCK(...)     rcu_read_unlock()
3612 #include "memory_ldst.inc.c"
3613
3614 int64_t address_space_cache_init(MemoryRegionCache *cache,
3615                                  AddressSpace *as,
3616                                  hwaddr addr,
3617                                  hwaddr len,
3618                                  bool is_write)
3619 {
3620     AddressSpaceDispatch *d;
3621     hwaddr l;
3622     MemoryRegion *mr;
3623
3624     assert(len > 0);
3625
3626     l = len;
3627     cache->fv = address_space_get_flatview(as);
3628     d = flatview_to_dispatch(cache->fv);
3629     cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
3630
3631     mr = cache->mrs.mr;
3632     memory_region_ref(mr);
3633     if (memory_access_is_direct(mr, is_write)) {
3634         /* We don't care about the memory attributes here as we're only
3635          * doing this if we found actual RAM, which behaves the same
3636          * regardless of attributes; so UNSPECIFIED is fine.
3637          */
3638         l = flatview_extend_translation(cache->fv, addr, len, mr,
3639                                         cache->xlat, l, is_write,
3640                                         MEMTXATTRS_UNSPECIFIED);
3641         cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
3642     } else {
3643         cache->ptr = NULL;
3644     }
3645
3646     cache->len = l;
3647     cache->is_write = is_write;
3648     return l;
3649 }
3650
3651 void address_space_cache_invalidate(MemoryRegionCache *cache,
3652                                     hwaddr addr,
3653                                     hwaddr access_len)
3654 {
3655     assert(cache->is_write);
3656     if (likely(cache->ptr)) {
3657         invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
3658     }
3659 }
3660
3661 void address_space_cache_destroy(MemoryRegionCache *cache)
3662 {
3663     if (!cache->mrs.mr) {
3664         return;
3665     }
3666
3667     if (xen_enabled()) {
3668         xen_invalidate_map_cache_entry(cache->ptr);
3669     }
3670     memory_region_unref(cache->mrs.mr);
3671     flatview_unref(cache->fv);
3672     cache->mrs.mr = NULL;
3673     cache->fv = NULL;
3674 }
3675
3676 /* Called from RCU critical section.  This function has the same
3677  * semantics as address_space_translate, but it only works on a
3678  * predefined range of a MemoryRegion that was mapped with
3679  * address_space_cache_init.
3680  */
3681 static inline MemoryRegion *address_space_translate_cached(
3682     MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
3683     hwaddr *plen, bool is_write, MemTxAttrs attrs)
3684 {
3685     MemoryRegionSection section;
3686     MemoryRegion *mr;
3687     IOMMUMemoryRegion *iommu_mr;
3688     AddressSpace *target_as;
3689
3690     assert(!cache->ptr);
3691     *xlat = addr + cache->xlat;
3692
3693     mr = cache->mrs.mr;
3694     iommu_mr = memory_region_get_iommu(mr);
3695     if (!iommu_mr) {
3696         /* MMIO region.  */
3697         return mr;
3698     }
3699
3700     section = address_space_translate_iommu(iommu_mr, xlat, plen,
3701                                             NULL, is_write, true,
3702                                             &target_as, attrs);
3703     return section.mr;
3704 }
3705
3706 /* Called from RCU critical section. address_space_read_cached uses this
3707  * out of line function when the target is an MMIO or IOMMU region.
3708  */
3709 void
3710 address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3711                                    void *buf, hwaddr len)
3712 {
3713     hwaddr addr1, l;
3714     MemoryRegion *mr;
3715
3716     l = len;
3717     mr = address_space_translate_cached(cache, addr, &addr1, &l, false,
3718                                         MEMTXATTRS_UNSPECIFIED);
3719     flatview_read_continue(cache->fv,
3720                            addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3721                            addr1, l, mr);
3722 }
3723
3724 /* Called from RCU critical section. address_space_write_cached uses this
3725  * out of line function when the target is an MMIO or IOMMU region.
3726  */
3727 void
3728 address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3729                                     const void *buf, hwaddr len)
3730 {
3731     hwaddr addr1, l;
3732     MemoryRegion *mr;
3733
3734     l = len;
3735     mr = address_space_translate_cached(cache, addr, &addr1, &l, true,
3736                                         MEMTXATTRS_UNSPECIFIED);
3737     flatview_write_continue(cache->fv,
3738                             addr, MEMTXATTRS_UNSPECIFIED, buf, len,
3739                             addr1, l, mr);
3740 }
3741
/*
 * Instantiate the helpers from memory_ldst.inc.c for MemoryRegionCache:
 * the first argument is "MemoryRegionCache *cache", generated names get
 * the _cached_slow suffix, and translation is done by
 * address_space_translate_cached().  The RCU lock/unlock hooks expand to
 * no-ops here -- presumably because these helpers, like
 * address_space_translate_cached(), are expected to be called from within
 * an RCU critical section already (TODO confirm against the callers).
 */
#define ARG1_DECL                MemoryRegionCache *cache
#define ARG1                     cache
#define SUFFIX                   _cached_slow
#define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
#define RCU_READ_LOCK()          ((void)0)
#define RCU_READ_UNLOCK()        ((void)0)
#include "memory_ldst.inc.c"
3749
3750 /* virtual memory access for debug (includes writing to ROM) */
3751 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
3752                         uint8_t *buf, target_ulong len, int is_write)
3753 {
3754     hwaddr phys_addr;
3755     target_ulong l, page;
3756
3757     cpu_synchronize_state(cpu);
3758     while (len > 0) {
3759         int asidx;
3760         MemTxAttrs attrs;
3761
3762         page = addr & TARGET_PAGE_MASK;
3763         phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3764         asidx = cpu_asidx_from_attrs(cpu, attrs);
3765         /* if no physical page mapped, return an error */
3766         if (phys_addr == -1)
3767             return -1;
3768         l = (page + TARGET_PAGE_SIZE) - addr;
3769         if (l > len)
3770             l = len;
3771         phys_addr += (addr & ~TARGET_PAGE_MASK);
3772         if (is_write) {
3773             address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr,
3774                                     attrs, buf, l);
3775         } else {
3776             address_space_rw(cpu->cpu_ases[asidx].as, phys_addr,
3777                              attrs, buf, l, 0);
3778         }
3779         len -= l;
3780         buf += l;
3781         addr += l;
3782     }
3783     return 0;
3784 }
3785
3786 /*
3787  * Allows code that needs to deal with migration bitmaps etc to still be built
3788  * target independent.
3789  */
3790 size_t qemu_target_page_size(void)
3791 {
3792     return TARGET_PAGE_SIZE;
3793 }
3794
3795 int qemu_target_page_bits(void)
3796 {
3797     return TARGET_PAGE_BITS;
3798 }
3799
3800 int qemu_target_page_bits_min(void)
3801 {
3802     return TARGET_PAGE_BITS_MIN;
3803 }
3804 #endif
3805
/* Report whether this build defines TARGET_WORDS_BIGENDIAN. */
bool target_words_bigendian(void)
{
#if !defined(TARGET_WORDS_BIGENDIAN)
    return false;
#else
    return true;
#endif
}
3814
3815 #ifndef CONFIG_USER_ONLY
3816 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3817 {
3818     MemoryRegion*mr;
3819     hwaddr l = 1;
3820     bool res;
3821
3822     RCU_READ_LOCK_GUARD();
3823     mr = address_space_translate(&address_space_memory,
3824                                  phys_addr, &phys_addr, &l, false,
3825                                  MEMTXATTRS_UNSPECIFIED);
3826
3827     res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3828     return res;
3829 }
3830
3831 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3832 {
3833     RAMBlock *block;
3834     int ret = 0;
3835
3836     RCU_READ_LOCK_GUARD();
3837     RAMBLOCK_FOREACH(block) {
3838         ret = func(block, opaque);
3839         if (ret) {
3840             break;
3841         }
3842     }
3843     return ret;
3844 }
3845
3846 /*
3847  * Unmap pages of memory from start to start+length such that
3848  * they a) read as 0, b) Trigger whatever fault mechanism
3849  * the OS provides for postcopy.
3850  * The pages must be unmapped by the end of the function.
3851  * Returns: 0 on success, none-0 on failure
3852  *
3853  */
3854 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3855 {
3856     int ret = -1;
3857
3858     uint8_t *host_startaddr = rb->host + start;
3859
3860     if ((uintptr_t)host_startaddr & (rb->page_size - 1)) {
3861         error_report("ram_block_discard_range: Unaligned start address: %p",
3862                      host_startaddr);
3863         goto err;
3864     }
3865
3866     if ((start + length) <= rb->used_length) {
3867         bool need_madvise, need_fallocate;
3868         uint8_t *host_endaddr = host_startaddr + length;
3869         if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
3870             error_report("ram_block_discard_range: Unaligned end address: %p",
3871                          host_endaddr);
3872             goto err;
3873         }
3874
3875         errno = ENOTSUP; /* If we are missing MADVISE etc */
3876
3877         /* The logic here is messy;
3878          *    madvise DONTNEED fails for hugepages
3879          *    fallocate works on hugepages and shmem
3880          */
3881         need_madvise = (rb->page_size == qemu_host_page_size);
3882         need_fallocate = rb->fd != -1;
3883         if (need_fallocate) {
3884             /* For a file, this causes the area of the file to be zero'd
3885              * if read, and for hugetlbfs also causes it to be unmapped
3886              * so a userfault will trigger.
3887              */
3888 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3889             ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3890                             start, length);
3891             if (ret) {
3892                 ret = -errno;
3893                 error_report("ram_block_discard_range: Failed to fallocate "
3894                              "%s:%" PRIx64 " +%zx (%d)",
3895                              rb->idstr, start, length, ret);
3896                 goto err;
3897             }
3898 #else
3899             ret = -ENOSYS;
3900             error_report("ram_block_discard_range: fallocate not available/file"
3901                          "%s:%" PRIx64 " +%zx (%d)",
3902                          rb->idstr, start, length, ret);
3903             goto err;
3904 #endif
3905         }
3906         if (need_madvise) {
3907             /* For normal RAM this causes it to be unmapped,
3908              * for shared memory it causes the local mapping to disappear
3909              * and to fall back on the file contents (which we just
3910              * fallocate'd away).
3911              */
3912 #if defined(CONFIG_MADVISE)
3913             ret =  madvise(host_startaddr, length, MADV_DONTNEED);
3914             if (ret) {
3915                 ret = -errno;
3916                 error_report("ram_block_discard_range: Failed to discard range "
3917                              "%s:%" PRIx64 " +%zx (%d)",
3918                              rb->idstr, start, length, ret);
3919                 goto err;
3920             }
3921 #else
3922             ret = -ENOSYS;
3923             error_report("ram_block_discard_range: MADVISE not available"
3924                          "%s:%" PRIx64 " +%zx (%d)",
3925                          rb->idstr, start, length, ret);
3926             goto err;
3927 #endif
3928         }
3929         trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
3930                                       need_madvise, need_fallocate, ret);
3931     } else {
3932         error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
3933                      "/%zx/" RAM_ADDR_FMT")",
3934                      rb->idstr, start, length, rb->used_length);
3935     }
3936
3937 err:
3938     return ret;
3939 }
3940
3941 bool ramblock_is_pmem(RAMBlock *rb)
3942 {
3943     return rb->flags & RAM_PMEM;
3944 }
3945
3946 #endif
3947
3948 void page_size_init(void)
3949 {
3950     /* NOTE: we can always suppose that qemu_host_page_size >=
3951        TARGET_PAGE_SIZE */
3952     if (qemu_host_page_size == 0) {
3953         qemu_host_page_size = qemu_real_host_page_size;
3954     }
3955     if (qemu_host_page_size < TARGET_PAGE_SIZE) {
3956         qemu_host_page_size = TARGET_PAGE_SIZE;
3957     }
3958     qemu_host_page_mask = -(intptr_t)qemu_host_page_size;
3959 }
3960
3961 #if !defined(CONFIG_USER_ONLY)
3962
3963 static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
3964 {
3965     if (start == end - 1) {
3966         qemu_printf("\t%3d      ", start);
3967     } else {
3968         qemu_printf("\t%3d..%-3d ", start, end - 1);
3969     }
3970     qemu_printf(" skip=%d ", skip);
3971     if (ptr == PHYS_MAP_NODE_NIL) {
3972         qemu_printf(" ptr=NIL");
3973     } else if (!skip) {
3974         qemu_printf(" ptr=#%d", ptr);
3975     } else {
3976         qemu_printf(" ptr=[%d]", ptr);
3977     }
3978     qemu_printf("\n");
3979 }
3980
/* (sz - 1) truncated to hwaddr for a non-zero Int128 sz, 0 when sz is zero. */
#define MR_SIZE(sz) \
    (int128_nz(sz) ? (hwaddr)int128_get64(int128_sub((sz), int128_one())) : 0)
3983
3984 void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
3985 {
3986     int i;
3987
3988     qemu_printf("  Dispatch\n");
3989     qemu_printf("    Physical sections\n");
3990
3991     for (i = 0; i < d->map.sections_nb; ++i) {
3992         MemoryRegionSection *s = d->map.sections + i;
3993         const char *names[] = { " [unassigned]", " [not dirty]",
3994                                 " [ROM]", " [watch]" };
3995
3996         qemu_printf("      #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx
3997                     " %s%s%s%s%s",
3998             i,
3999             s->offset_within_address_space,
4000             s->offset_within_address_space + MR_SIZE(s->mr->size),
4001             s->mr->name ? s->mr->name : "(noname)",
4002             i < ARRAY_SIZE(names) ? names[i] : "",
4003             s->mr == root ? " [ROOT]" : "",
4004             s == d->mru_section ? " [MRU]" : "",
4005             s->mr->is_iommu ? " [iommu]" : "");
4006
4007         if (s->mr->alias) {
4008             qemu_printf(" alias=%s", s->mr->alias->name ?
4009                     s->mr->alias->name : "noname");
4010         }
4011         qemu_printf("\n");
4012     }
4013
4014     qemu_printf("    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
4015                P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
4016     for (i = 0; i < d->map.nodes_nb; ++i) {
4017         int j, jprev;
4018         PhysPageEntry prev;
4019         Node *n = d->map.nodes + i;
4020
4021         qemu_printf("      [%d]\n", i);
4022
4023         for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
4024             PhysPageEntry *pe = *n + j;
4025
4026             if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
4027                 continue;
4028             }
4029
4030             mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4031
4032             jprev = j;
4033             prev = *pe;
4034         }
4035
4036         if (jprev != ARRAY_SIZE(*n)) {
4037             mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4038         }
4039     }
4040 }
4041
4042 #endif