accel/tcg/ldst_atomicity.c.inc

   1 /*
   2  * Routines common to user and system emulation of load/store.
   3  *
   4  *  Copyright (c) 2022 Linaro, Ltd.
   5  *
   6  * SPDX-License-Identifier: GPL-2.0-or-later
   7  *
   8  * This work is licensed under the terms of the GNU GPL, version 2 or later.
   9  * See the COPYING file in the top-level directory.
  10  */
  11
  12 #include "host/load-extract-al16-al8.h"
  13 #include "host/store-insert-al16.h"
  14
  15 #ifdef CONFIG_ATOMIC64
  16 # define HAVE_al8          true
  17 #else
  18 # define HAVE_al8          false
  19 #endif
  20 #define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)
  21
  22 /**
  23  * required_atomicity:
  24  *
  25  * Return the lg2 bytes of atomicity required by @memop for @p.
  26  * If the operation must be split into two operations to be
  27  * examined separately for atomicity, return -lg2.
  28  */
  29 static int required_atomicity(CPUState *cpu, uintptr_t p, MemOp memop)
  30 {
  31     MemOp atom = memop & MO_ATOM_MASK;
  32     MemOp size = memop & MO_SIZE;
  33     MemOp half = size ? size - 1 : 0;
  34     unsigned tmp;
  35     int atmax;
  36
  37     switch (atom) {
  38     case MO_ATOM_NONE:
  39         atmax = MO_8;
  40         break;
  41
  42     case MO_ATOM_IFALIGN_PAIR:
  43         size = half;
  44         /* fall through */
  45
  46     case MO_ATOM_IFALIGN:
  47         tmp = (1 << size) - 1;
  48         atmax = p & tmp ? MO_8 : size;
  49         break;
  50
  51     case MO_ATOM_WITHIN16:
  52         tmp = p & 15;
  53         atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
  54         break;
  55
  56     case MO_ATOM_WITHIN16_PAIR:
  57         tmp = p & 15;
  58         if (tmp + (1 << size) <= 16) {
  59             atmax = size;
  60         } else if (tmp + (1 << half) == 16) {
  61             /*
  62              * The pair exactly straddles the boundary.
  63              * Both halves are naturally aligned and atomic.
  64              */
  65             atmax = half;
  66         } else {
  67             /*
  68              * One of the pair crosses the boundary, and is non-atomic.
  69              * The other of the pair does not cross, and is atomic.
  70              */
  71             atmax = -half;
  72         }
  73         break;
  74
  75     case MO_ATOM_SUBALIGN:
  76         /*
  77          * Examine the alignment of p to determine if there are subobjects
  78          * that must be aligned.  Note that we only really need ctz4() --
  79          * any more sigificant bits are discarded by the immediately
  80          * following comparison.
  81          */
  82         tmp = ctz32(p);
  83         atmax = MIN(size, tmp);
  84         break;
  85
  86     default:
  87         g_assert_not_reached();
  88     }
  89
  90     /*
  91      * Here we have the architectural atomicity of the operation.
  92      * However, when executing in a serial context, we need no extra
  93      * host atomicity in order to avoid racing.  This reduction
  94      * avoids looping with cpu_loop_exit_atomic.
  95      */
  96     if (cpu_in_serial_context(cpu)) {
  97         return MO_8;
  98     }
  99     return atmax;
 100 }
 101
 102 /**
 103  * load_atomic2:
 104  * @pv: host address
 105  *
 106  * Atomically load 2 aligned bytes from @pv.
 107  */
 108 static inline uint16_t load_atomic2(void *pv)
 109 {
 110     uint16_t *p = __builtin_assume_aligned(pv, 2);
 111     return qatomic_read(p);
 112 }
 113
 114 /**
 115  * load_atomic4:
 116  * @pv: host address
 117  *
 118  * Atomically load 4 aligned bytes from @pv.
 119  */
 120 static inline uint32_t load_atomic4(void *pv)
 121 {
 122     uint32_t *p = __builtin_assume_aligned(pv, 4);
 123     return qatomic_read(p);
 124 }
 125
 126 /**
 127  * load_atomic8:
 128  * @pv: host address
 129  *
 130  * Atomically load 8 aligned bytes from @pv.
 131  */
 132 static inline uint64_t load_atomic8(void *pv)
 133 {
 134     uint64_t *p = __builtin_assume_aligned(pv, 8);
 135
 136     qemu_build_assert(HAVE_al8);
 137     return qatomic_read__nocheck(p);
 138 }
 139
 140 /**
 141  * load_atomic8_or_exit:
 142  * @cpu: generic cpu state
 143  * @ra: host unwind address
 144  * @pv: host address
 145  *
 146  * Atomically load 8 aligned bytes from @pv.
 147  * If this is not possible, longjmp out to restart serially.
 148  */
 149 static uint64_t load_atomic8_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
 150 {
 151     if (HAVE_al8) {
 152         return load_atomic8(pv);
 153     }
 154
 155 #ifdef CONFIG_USER_ONLY
 156     /*
 157      * If the page is not writable, then assume the value is immutable
 158      * and requires no locking.  This ignores the case of MAP_SHARED with
 159      * another process, because the fallback start_exclusive solution
 160      * provides no protection across processes.
 161      */
 162     WITH_MMAP_LOCK_GUARD() {
 163         if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
 164             uint64_t *p = __builtin_assume_aligned(pv, 8);
 165             return *p;
 166         }
 167     }
 168 #endif
 169
 170     /* Ultimate fallback: re-execute in serial context. */
 171     cpu_loop_exit_atomic(cpu, ra);
 172 }
 173
 174 /**
 175  * load_atomic16_or_exit:
 176  * @cpu: generic cpu state
 177  * @ra: host unwind address
 178  * @pv: host address
 179  *
 180  * Atomically load 16 aligned bytes from @pv.
 181  * If this is not possible, longjmp out to restart serially.
 182  */
 183 static Int128 load_atomic16_or_exit(CPUState *cpu, uintptr_t ra, void *pv)
 184 {
 185     Int128 *p = __builtin_assume_aligned(pv, 16);
 186
 187     if (HAVE_ATOMIC128_RO) {
 188         return atomic16_read_ro(p);
 189     }
 190
 191     /*
 192      * We can only use cmpxchg to emulate a load if the page is writable.
 193      * If the page is not writable, then assume the value is immutable
 194      * and requires no locking.  This ignores the case of MAP_SHARED with
 195      * another process, because the fallback start_exclusive solution
 196      * provides no protection across processes.
 197      *
 198      * In system mode all guest pages are writable.  For user mode,
 199      * we must take mmap_lock so that the query remains valid until
 200      * the write is complete -- tests/tcg/multiarch/munmap-pthread.c
 201      * is an example that can race.
 202      */
 203     WITH_MMAP_LOCK_GUARD() {
 204 #ifdef CONFIG_USER_ONLY
 205         if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
 206             return *p;
 207         }
 208 #endif
 209         if (HAVE_ATOMIC128_RW) {
 210             return atomic16_read_rw(p);
 211         }
 212     }
 213
 214     /* Ultimate fallback: re-execute in serial context. */
 215     cpu_loop_exit_atomic(cpu, ra);
 216 }
 217
 218 /**
 219  * load_atom_extract_al4x2:
 220  * @pv: host address
 221  *
 222  * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
 223  */
 224 static uint32_t load_atom_extract_al4x2(void *pv)
 225 {
 226     uintptr_t pi = (uintptr_t)pv;
 227     int sh = (pi & 3) * 8;
 228     uint32_t a, b;
 229
 230     pv = (void *)(pi & ~3);
 231     a = load_atomic4(pv);
 232     b = load_atomic4(pv + 4);
 233
 234     if (HOST_BIG_ENDIAN) {
 235         return (a << sh) | (b >> (-sh & 31));
 236     } else {
 237         return (a >> sh) | (b << (-sh & 31));
 238     }
 239 }
 240
 241 /**
 242  * load_atom_extract_al8x2:
 243  * @pv: host address
 244  *
 245  * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
 246  */
 247 static uint64_t load_atom_extract_al8x2(void *pv)
 248 {
 249     uintptr_t pi = (uintptr_t)pv;
 250     int sh = (pi & 7) * 8;
 251     uint64_t a, b;
 252
 253     pv = (void *)(pi & ~7);
 254     a = load_atomic8(pv);
 255     b = load_atomic8(pv + 8);
 256
 257     if (HOST_BIG_ENDIAN) {
 258         return (a << sh) | (b >> (-sh & 63));
 259     } else {
 260         return (a >> sh) | (b << (-sh & 63));
 261     }
 262 }
 263
 264 /**
 265  * load_atom_extract_al8_or_exit:
 266  * @cpu: generic cpu state
 267  * @ra: host unwind address
 268  * @pv: host address
 269  * @s: object size in bytes, @s <= 4.
 270  *
 271  * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 272  * not cross an 8-byte boundary.  This means that we can perform an atomic
 273  * 8-byte load and extract.
 274  * The value is returned in the low bits of a uint32_t.
 275  */
 276 static uint32_t load_atom_extract_al8_or_exit(CPUState *cpu, uintptr_t ra,
 277                                               void *pv, int s)
 278 {
 279     uintptr_t pi = (uintptr_t)pv;
 280     int o = pi & 7;
 281     int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
 282
 283     pv = (void *)(pi & ~7);
 284     return load_atomic8_or_exit(cpu, ra, pv) >> shr;
 285 }
 286
 287 /**
 288  * load_atom_extract_al16_or_exit:
 289  * @cpu: generic cpu state
 290  * @ra: host unwind address
 291  * @p: host address
 292  * @s: object size in bytes, @s <= 8.
 293  *
 294  * Atomically load @s bytes from @p, when p % 16 < 8
 295  * and p % 16 + s > 8.  I.e. does not cross a 16-byte
 296  * boundary, but *does* cross an 8-byte boundary.
 297  * This is the slow version, so we must have eliminated
 298  * any faster load_atom_extract_al8_or_exit case.
 299  *
 300  * If this is not possible, longjmp out to restart serially.
 301  */
 302 static uint64_t load_atom_extract_al16_or_exit(CPUState *cpu, uintptr_t ra,
 303                                                void *pv, int s)
 304 {
 305     uintptr_t pi = (uintptr_t)pv;
 306     int o = pi & 7;
 307     int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
 308     Int128 r;
 309
 310     /*
 311      * Note constraints above: p & 8 must be clear.
 312      * Provoke SIGBUS if possible otherwise.
 313      */
 314     pv = (void *)(pi & ~7);
 315     r = load_atomic16_or_exit(cpu, ra, pv);
 316
 317     r = int128_urshift(r, shr);
 318     return int128_getlo(r);
 319 }
 320
 321 /**
 322  * load_atom_4_by_2:
 323  * @pv: host address
 324  *
 325  * Load 4 bytes from @pv, with two 2-byte atomic loads.
 326  */
 327 static inline uint32_t load_atom_4_by_2(void *pv)
 328 {
 329     uint32_t a = load_atomic2(pv);
 330     uint32_t b = load_atomic2(pv + 2);
 331
 332     if (HOST_BIG_ENDIAN) {
 333         return (a << 16) | b;
 334     } else {
 335         return (b << 16) | a;
 336     }
 337 }
 338
 339 /**
 340  * load_atom_8_by_2:
 341  * @pv: host address
 342  *
 343  * Load 8 bytes from @pv, with four 2-byte atomic loads.
 344  */
 345 static inline uint64_t load_atom_8_by_2(void *pv)
 346 {
 347     uint32_t a = load_atom_4_by_2(pv);
 348     uint32_t b = load_atom_4_by_2(pv + 4);
 349
 350     if (HOST_BIG_ENDIAN) {
 351         return ((uint64_t)a << 32) | b;
 352     } else {
 353         return ((uint64_t)b << 32) | a;
 354     }
 355 }
 356
 357 /**
 358  * load_atom_8_by_4:
 359  * @pv: host address
 360  *
 361  * Load 8 bytes from @pv, with two 4-byte atomic loads.
 362  */
 363 static inline uint64_t load_atom_8_by_4(void *pv)
 364 {
 365     uint32_t a = load_atomic4(pv);
 366     uint32_t b = load_atomic4(pv + 4);
 367
 368     if (HOST_BIG_ENDIAN) {
 369         return ((uint64_t)a << 32) | b;
 370     } else {
 371         return ((uint64_t)b << 32) | a;
 372     }
 373 }
 374
 375 /**
 376  * load_atom_8_by_8_or_4:
 377  * @pv: host address
 378  *
 379  * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 380  */
 381 static inline uint64_t load_atom_8_by_8_or_4(void *pv)
 382 {
 383     if (HAVE_al8_fast) {
 384         return load_atomic8(pv);
 385     } else {
 386         return load_atom_8_by_4(pv);
 387     }
 388 }
 389
 390 /**
 391  * load_atom_2:
 392  * @p: host address
 393  * @memop: the full memory op
 394  *
 395  * Load 2 bytes from @p, honoring the atomicity of @memop.
 396  */
 397 static uint16_t load_atom_2(CPUState *cpu, uintptr_t ra,
 398                             void *pv, MemOp memop)
 399 {
 400     uintptr_t pi = (uintptr_t)pv;
 401     int atmax;
 402
 403     if (likely((pi & 1) == 0)) {
 404         return load_atomic2(pv);
 405     }
 406     if (HAVE_ATOMIC128_RO) {
 407         intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
 408         if (likely(left_in_page > 8)) {
 409             return load_atom_extract_al16_or_al8(pv, 2);
 410         }
 411     }
 412
 413     atmax = required_atomicity(cpu, pi, memop);
 414     switch (atmax) {
 415     case MO_8:
 416         return lduw_he_p(pv);
 417     case MO_16:
 418         /* The only case remaining is MO_ATOM_WITHIN16. */
 419         if (!HAVE_al8_fast && (pi & 3) == 1) {
 420             /* Big or little endian, we want the middle two bytes. */
 421             return load_atomic4(pv - 1) >> 8;
 422         }
 423         if ((pi & 15) != 7) {
 424             return load_atom_extract_al8_or_exit(cpu, ra, pv, 2);
 425         }
 426         return load_atom_extract_al16_or_exit(cpu, ra, pv, 2);
 427     default:
 428         g_assert_not_reached();
 429     }
 430 }
 431
 432 /**
 433  * load_atom_4:
 434  * @p: host address
 435  * @memop: the full memory op
 436  *
 437  * Load 4 bytes from @p, honoring the atomicity of @memop.
 438  */
 439 static uint32_t load_atom_4(CPUState *cpu, uintptr_t ra,
 440                             void *pv, MemOp memop)
 441 {
 442     uintptr_t pi = (uintptr_t)pv;
 443     int atmax;
 444
 445     if (likely((pi & 3) == 0)) {
 446         return load_atomic4(pv);
 447     }
 448     if (HAVE_ATOMIC128_RO) {
 449         intptr_t left_in_page = -(pi | TARGET_PAGE_MASK);
 450         if (likely(left_in_page > 8)) {
 451             return load_atom_extract_al16_or_al8(pv, 4);
 452         }
 453     }
 454
 455     atmax = required_atomicity(cpu, pi, memop);
 456     switch (atmax) {
 457     case MO_8:
 458     case MO_16:
 459     case -MO_16:
 460         /*
 461          * For MO_ATOM_IFALIGN, this is more atomicity than required,
 462          * but it's trivially supported on all hosts, better than 4
 463          * individual byte loads (when the host requires alignment),
 464          * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
 465          */
 466         return load_atom_extract_al4x2(pv);
 467     case MO_32:
 468         if (!(pi & 4)) {
 469             return load_atom_extract_al8_or_exit(cpu, ra, pv, 4);
 470         }
 471         return load_atom_extract_al16_or_exit(cpu, ra, pv, 4);
 472     default:
 473         g_assert_not_reached();
 474     }
 475 }
 476
 477 /**
 478  * load_atom_8:
 479  * @p: host address
 480  * @memop: the full memory op
 481  *
 482  * Load 8 bytes from @p, honoring the atomicity of @memop.
 483  */
 484 static uint64_t load_atom_8(CPUState *cpu, uintptr_t ra,
 485                             void *pv, MemOp memop)
 486 {
 487     uintptr_t pi = (uintptr_t)pv;
 488     int atmax;
 489
 490     /*
 491      * If the host does not support 8-byte atomics, wait until we have
 492      * examined the atomicity parameters below.
 493      */
 494     if (HAVE_al8 && likely((pi & 7) == 0)) {
 495         return load_atomic8(pv);
 496     }
 497     if (HAVE_ATOMIC128_RO) {
 498         return load_atom_extract_al16_or_al8(pv, 8);
 499     }
 500
 501     atmax = required_atomicity(cpu, pi, memop);
 502     if (atmax == MO_64) {
 503         if (!HAVE_al8 && (pi & 7) == 0) {
 504             load_atomic8_or_exit(cpu, ra, pv);
 505         }
 506         return load_atom_extract_al16_or_exit(cpu, ra, pv, 8);
 507     }
 508     if (HAVE_al8_fast) {
 509         return load_atom_extract_al8x2(pv);
 510     }
 511     switch (atmax) {
 512     case MO_8:
 513         return ldq_he_p(pv);
 514     case MO_16:
 515         return load_atom_8_by_2(pv);
 516     case MO_32:
 517         return load_atom_8_by_4(pv);
 518     case -MO_32:
 519         if (HAVE_al8) {
 520             return load_atom_extract_al8x2(pv);
 521         }
 522         cpu_loop_exit_atomic(cpu, ra);
 523     default:
 524         g_assert_not_reached();
 525     }
 526 }
 527
 528 /**
 529  * load_atom_16:
 530  * @p: host address
 531  * @memop: the full memory op
 532  *
 533  * Load 16 bytes from @p, honoring the atomicity of @memop.
 534  */
 535 static Int128 load_atom_16(CPUState *cpu, uintptr_t ra,
 536                            void *pv, MemOp memop)
 537 {
 538     uintptr_t pi = (uintptr_t)pv;
 539     int atmax;
 540     Int128 r;
 541     uint64_t a, b;
 542
 543     /*
 544      * If the host does not support 16-byte atomics, wait until we have
 545      * examined the atomicity parameters below.
 546      */
 547     if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
 548         return atomic16_read_ro(pv);
 549     }
 550
 551     atmax = required_atomicity(cpu, pi, memop);
 552     switch (atmax) {
 553     case MO_8:
 554         memcpy(&r, pv, 16);
 555         return r;
 556     case MO_16:
 557         a = load_atom_8_by_2(pv);
 558         b = load_atom_8_by_2(pv + 8);
 559         break;
 560     case MO_32:
 561         a = load_atom_8_by_4(pv);
 562         b = load_atom_8_by_4(pv + 8);
 563         break;
 564     case MO_64:
 565         if (!HAVE_al8) {
 566             cpu_loop_exit_atomic(cpu, ra);
 567         }
 568         a = load_atomic8(pv);
 569         b = load_atomic8(pv + 8);
 570         break;
 571     case -MO_64:
 572         if (!HAVE_al8) {
 573             cpu_loop_exit_atomic(cpu, ra);
 574         }
 575         a = load_atom_extract_al8x2(pv);
 576         b = load_atom_extract_al8x2(pv + 8);
 577         break;
 578     case MO_128:
 579         return load_atomic16_or_exit(cpu, ra, pv);
 580     default:
 581         g_assert_not_reached();
 582     }
 583     return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
 584 }
 585
 586 /**
 587  * store_atomic2:
 588  * @pv: host address
 589  * @val: value to store
 590  *
 591  * Atomically store 2 aligned bytes to @pv.
 592  */
 593 static inline void store_atomic2(void *pv, uint16_t val)
 594 {
 595     uint16_t *p = __builtin_assume_aligned(pv, 2);
 596     qatomic_set(p, val);
 597 }
 598
 599 /**
 600  * store_atomic4:
 601  * @pv: host address
 602  * @val: value to store
 603  *
 604  * Atomically store 4 aligned bytes to @pv.
 605  */
 606 static inline void store_atomic4(void *pv, uint32_t val)
 607 {
 608     uint32_t *p = __builtin_assume_aligned(pv, 4);
 609     qatomic_set(p, val);
 610 }
 611
 612 /**
 613  * store_atomic8:
 614  * @pv: host address
 615  * @val: value to store
 616  *
 617  * Atomically store 8 aligned bytes to @pv.
 618  */
 619 static inline void store_atomic8(void *pv, uint64_t val)
 620 {
 621     uint64_t *p = __builtin_assume_aligned(pv, 8);
 622
 623     qemu_build_assert(HAVE_al8);
 624     qatomic_set__nocheck(p, val);
 625 }
 626
 627 /**
 628  * store_atom_4x2
 629  */
 630 static inline void store_atom_4_by_2(void *pv, uint32_t val)
 631 {
 632     store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
 633     store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
 634 }
 635
 636 /**
 637  * store_atom_8_by_2
 638  */
 639 static inline void store_atom_8_by_2(void *pv, uint64_t val)
 640 {
 641     store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
 642     store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
 643 }
 644
 645 /**
 646  * store_atom_8_by_4
 647  */
 648 static inline void store_atom_8_by_4(void *pv, uint64_t val)
 649 {
 650     store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
 651     store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
 652 }
 653
 654 /**
 655  * store_atom_insert_al4:
 656  * @p: host address
 657  * @val: shifted value to store
 658  * @msk: mask for value to store
 659  *
 660  * Atomically store @val to @p, masked by @msk.
 661  */
 662 static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
 663 {
 664     uint32_t old, new;
 665
 666     p = __builtin_assume_aligned(p, 4);
 667     old = qatomic_read(p);
 668     do {
 669         new = (old & ~msk) | val;
 670     } while (!__atomic_compare_exchange_n(p, &old, new, true,
 671                                           __ATOMIC_RELAXED, __ATOMIC_RELAXED));
 672 }
 673
 674 /**
 675  * store_atom_insert_al8:
 676  * @p: host address
 677  * @val: shifted value to store
 678  * @msk: mask for value to store
 679  *
 680  * Atomically store @val to @p masked by @msk.
 681  */
 682 static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
 683 {
 684     uint64_t old, new;
 685
 686     qemu_build_assert(HAVE_al8);
 687     p = __builtin_assume_aligned(p, 8);
 688     old = qatomic_read__nocheck(p);
 689     do {
 690         new = (old & ~msk) | val;
 691     } while (!__atomic_compare_exchange_n(p, &old, new, true,
 692                                           __ATOMIC_RELAXED, __ATOMIC_RELAXED));
 693 }
 694
 695 /**
 696  * store_bytes_leN:
 697  * @pv: host address
 698  * @size: number of bytes to store
 699  * @val_le: data to store
 700  *
 701  * Store @size bytes at @p.  The bytes to store are extracted in little-endian order
 702  * from @val_le; return the bytes of @val_le beyond @size that have not been stored.
 703  */
 704 static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
 705 {
 706     uint8_t *p = pv;
 707     for (int i = 0; i < size; i++, val_le >>= 8) {
 708         p[i] = val_le;
 709     }
 710     return val_le;
 711 }
 712
 713 /**
 714  * store_parts_leN
 715  * @pv: host address
 716  * @size: number of bytes to store
 717  * @val_le: data to store
 718  *
 719  * As store_bytes_leN, but atomically on each aligned part.
 720  */
 721 G_GNUC_UNUSED
 722 static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
 723 {
 724     do {
 725         int n;
 726
 727         /* Find minimum of alignment and size */
 728         switch (((uintptr_t)pv | size) & 7) {
 729         case 4:
 730             store_atomic4(pv, le32_to_cpu(val_le));
 731             val_le >>= 32;
 732             n = 4;
 733             break;
 734         case 2:
 735         case 6:
 736             store_atomic2(pv, le16_to_cpu(val_le));
 737             val_le >>= 16;
 738             n = 2;
 739             break;
 740         default:
 741             *(uint8_t *)pv = val_le;
 742             val_le >>= 8;
 743             n = 1;
 744             break;
 745         case 0:
 746             g_assert_not_reached();
 747         }
 748         pv += n;
 749         size -= n;
 750     } while (size != 0);
 751
 752     return val_le;
 753 }
 754
 755 /**
 756  * store_whole_le4
 757  * @pv: host address
 758  * @size: number of bytes to store
 759  * @val_le: data to store
 760  *
 761  * As store_bytes_leN, but atomically as a whole.
 762  * Four aligned bytes are guaranteed to cover the store.
 763  */
 764 static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
 765 {
 766     int sz = size * 8;
 767     int o = (uintptr_t)pv & 3;
 768     int sh = o * 8;
 769     uint32_t m = MAKE_64BIT_MASK(0, sz);
 770     uint32_t v;
 771
 772     if (HOST_BIG_ENDIAN) {
 773         v = bswap32(val_le) >> sh;
 774         m = bswap32(m) >> sh;
 775     } else {
 776         v = val_le << sh;
 777         m <<= sh;
 778     }
 779     store_atom_insert_al4(pv - o, v, m);
 780     return val_le >> sz;
 781 }
 782
 783 /**
 784  * store_whole_le8
 785  * @pv: host address
 786  * @size: number of bytes to store
 787  * @val_le: data to store
 788  *
 789  * As store_bytes_leN, but atomically as a whole.
 790  * Eight aligned bytes are guaranteed to cover the store.
 791  */
 792 static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
 793 {
 794     int sz = size * 8;
 795     int o = (uintptr_t)pv & 7;
 796     int sh = o * 8;
 797     uint64_t m = MAKE_64BIT_MASK(0, sz);
 798     uint64_t v;
 799
 800     qemu_build_assert(HAVE_al8);
 801     if (HOST_BIG_ENDIAN) {
 802         v = bswap64(val_le) >> sh;
 803         m = bswap64(m) >> sh;
 804     } else {
 805         v = val_le << sh;
 806         m <<= sh;
 807     }
 808     store_atom_insert_al8(pv - o, v, m);
 809     return val_le >> sz;
 810 }
 811
 812 /**
 813  * store_whole_le16
 814  * @pv: host address
 815  * @size: number of bytes to store
 816  * @val_le: data to store
 817  *
 818  * As store_bytes_leN, but atomically as a whole.
 819  * 16 aligned bytes are guaranteed to cover the store.
 820  */
 821 static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
 822 {
 823     int sz = size * 8;
 824     int o = (uintptr_t)pv & 15;
 825     int sh = o * 8;
 826     Int128 m, v;
 827
 828     qemu_build_assert(HAVE_ATOMIC128_RW);
 829
 830     /* Like MAKE_64BIT_MASK(0, sz), but larger. */
 831     if (sz <= 64) {
 832         m = int128_make64(MAKE_64BIT_MASK(0, sz));
 833     } else {
 834         m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
 835     }
 836
 837     if (HOST_BIG_ENDIAN) {
 838         v = int128_urshift(bswap128(val_le), sh);
 839         m = int128_urshift(bswap128(m), sh);
 840     } else {
 841         v = int128_lshift(val_le, sh);
 842         m = int128_lshift(m, sh);
 843     }
 844     store_atom_insert_al16(pv - o, v, m);
 845
 846     if (sz <= 64) {
 847         return 0;
 848     }
 849     return int128_gethi(val_le) >> (sz - 64);
 850 }
 851
 852 /**
 853  * store_atom_2:
 854  * @p: host address
 855  * @val: the value to store
 856  * @memop: the full memory op
 857  *
 858  * Store 2 bytes to @p, honoring the atomicity of @memop.
 859  */
 860 static void store_atom_2(CPUState *cpu, uintptr_t ra,
 861                          void *pv, MemOp memop, uint16_t val)
 862 {
 863     uintptr_t pi = (uintptr_t)pv;
 864     int atmax;
 865
 866     if (likely((pi & 1) == 0)) {
 867         store_atomic2(pv, val);
 868         return;
 869     }
 870
 871     atmax = required_atomicity(cpu, pi, memop);
 872     if (atmax == MO_8) {
 873         stw_he_p(pv, val);
 874         return;
 875     }
 876
 877     /*
 878      * The only case remaining is MO_ATOM_WITHIN16.
 879      * Big or little endian, we want the middle two bytes in each test.
 880      */
 881     if ((pi & 3) == 1) {
 882         store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
 883         return;
 884     } else if ((pi & 7) == 3) {
 885         if (HAVE_al8) {
 886             store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
 887             return;
 888         }
 889     } else if ((pi & 15) == 7) {
 890         if (HAVE_ATOMIC128_RW) {
 891             Int128 v = int128_lshift(int128_make64(val), 56);
 892             Int128 m = int128_lshift(int128_make64(0xffff), 56);
 893             store_atom_insert_al16(pv - 7, v, m);
 894             return;
 895         }
 896     } else {
 897         g_assert_not_reached();
 898     }
 899
 900     cpu_loop_exit_atomic(cpu, ra);
 901 }
 902
 903 /**
 904  * store_atom_4:
 905  * @p: host address
 906  * @val: the value to store
 907  * @memop: the full memory op
 908  *
 909  * Store 4 bytes to @p, honoring the atomicity of @memop.
 910  */
 911 static void store_atom_4(CPUState *cpu, uintptr_t ra,
 912                          void *pv, MemOp memop, uint32_t val)
 913 {
 914     uintptr_t pi = (uintptr_t)pv;
 915     int atmax;
 916
 917     if (likely((pi & 3) == 0)) {
 918         store_atomic4(pv, val);
 919         return;
 920     }
 921
 922     atmax = required_atomicity(cpu, pi, memop);
 923     switch (atmax) {
 924     case MO_8:
 925         stl_he_p(pv, val);
 926         return;
 927     case MO_16:
 928         store_atom_4_by_2(pv, val);
 929         return;
 930     case -MO_16:
 931         {
 932             uint32_t val_le = cpu_to_le32(val);
 933             int s2 = pi & 3;
 934             int s1 = 4 - s2;
 935
 936             switch (s2) {
 937             case 1:
 938                 val_le = store_whole_le4(pv, s1, val_le);
 939                 *(uint8_t *)(pv + 3) = val_le;
 940                 break;
 941             case 3:
 942                 *(uint8_t *)pv = val_le;
 943                 store_whole_le4(pv + 1, s2, val_le >> 8);
 944                 break;
 945             case 0: /* aligned */
 946             case 2: /* atmax MO_16 */
 947             default:
 948                 g_assert_not_reached();
 949             }
 950         }
 951         return;
 952     case MO_32:
 953         if ((pi & 7) < 4) {
 954             if (HAVE_al8) {
 955                 store_whole_le8(pv, 4, cpu_to_le32(val));
 956                 return;
 957             }
 958         } else {
 959             if (HAVE_ATOMIC128_RW) {
 960                 store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
 961                 return;
 962             }
 963         }
 964         cpu_loop_exit_atomic(cpu, ra);
 965     default:
 966         g_assert_not_reached();
 967     }
 968 }
 969
 970 /**
 971  * store_atom_8:
 972  * @p: host address
 973  * @val: the value to store
 974  * @memop: the full memory op
 975  *
 976  * Store 8 bytes to @p, honoring the atomicity of @memop.
 977  */
 978 static void store_atom_8(CPUState *cpu, uintptr_t ra,
 979                          void *pv, MemOp memop, uint64_t val)
 980 {
 981     uintptr_t pi = (uintptr_t)pv;
 982     int atmax;
 983
 984     if (HAVE_al8 && likely((pi & 7) == 0)) {
 985         store_atomic8(pv, val);
 986         return;
 987     }
 988
 989     atmax = required_atomicity(cpu, pi, memop);
 990     switch (atmax) {
 991     case MO_8:
 992         stq_he_p(pv, val);
 993         return;
 994     case MO_16:
 995         store_atom_8_by_2(pv, val);
 996         return;
 997     case MO_32:
 998         store_atom_8_by_4(pv, val);
 999         return;
1000     case -MO_32:
1001         if (HAVE_al8) {
1002             uint64_t val_le = cpu_to_le64(val);
1003             int s2 = pi & 7;
1004             int s1 = 8 - s2;
1005
1006             switch (s2) {
1007             case 1 ... 3:
1008                 val_le = store_whole_le8(pv, s1, val_le);
1009                 store_bytes_leN(pv + s1, s2, val_le);
1010                 break;
1011             case 5 ... 7:
1012                 val_le = store_bytes_leN(pv, s1, val_le);
1013                 store_whole_le8(pv + s1, s2, val_le);
1014                 break;
1015             case 0: /* aligned */
1016             case 4: /* atmax MO_32 */
1017             default:
1018                 g_assert_not_reached();
1019             }
1020             return;
1021         }
1022         break;
1023     case MO_64:
1024         if (HAVE_ATOMIC128_RW) {
1025             store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
1026             return;
1027         }
1028         break;
1029     default:
1030         g_assert_not_reached();
1031     }
1032     cpu_loop_exit_atomic(cpu, ra);
1033 }
1034
1035 /**
1036  * store_atom_16:
1037  * @p: host address
1038  * @val: the value to store
1039  * @memop: the full memory op
1040  *
1041  * Store 16 bytes to @p, honoring the atomicity of @memop.
1042  */
1043 static void store_atom_16(CPUState *cpu, uintptr_t ra,
1044                           void *pv, MemOp memop, Int128 val)
1045 {
1046     uintptr_t pi = (uintptr_t)pv;
1047     uint64_t a, b;
1048     int atmax;
1049
1050     if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
1051         atomic16_set(pv, val);
1052         return;
1053     }
1054
1055     atmax = required_atomicity(cpu, pi, memop);
1056
1057     a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
1058     b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
1059     switch (atmax) {
1060     case MO_8:
1061         memcpy(pv, &val, 16);
1062         return;
1063     case MO_16:
1064         store_atom_8_by_2(pv, a);
1065         store_atom_8_by_2(pv + 8, b);
1066         return;
1067     case MO_32:
1068         store_atom_8_by_4(pv, a);
1069         store_atom_8_by_4(pv + 8, b);
1070         return;
1071     case MO_64:
1072         if (HAVE_al8) {
1073             store_atomic8(pv, a);
1074             store_atomic8(pv + 8, b);
1075             return;
1076         }
1077         break;
1078     case -MO_64:
1079         if (HAVE_ATOMIC128_RW) {
1080             uint64_t val_le;
1081             int s2 = pi & 15;
1082             int s1 = 16 - s2;
1083
1084             if (HOST_BIG_ENDIAN) {
1085                 val = bswap128(val);
1086             }
1087             switch (s2) {
1088             case 1 ... 7:
1089                 val_le = store_whole_le16(pv, s1, val);
1090                 store_bytes_leN(pv + s1, s2, val_le);
1091                 break;
1092             case 9 ... 15:
1093                 store_bytes_leN(pv, s1, int128_getlo(val));
1094                 val = int128_urshift(val, s1 * 8);
1095                 store_whole_le16(pv + s1, s2, val);
1096                 break;
1097             case 0: /* aligned */
1098             case 8: /* atmax MO_64 */
1099             default:
1100                 g_assert_not_reached();
1101             }
1102             return;
1103         }
1104         break;
1105     case MO_128:
1106         if (HAVE_ATOMIC128_RW) {
1107             atomic16_set(pv, val);
1108             return;
1109         }
1110         break;
1111     default:
1112         g_assert_not_reached();
1113     }
1114     cpu_loop_exit_atomic(cpu, ra);
1115 }