tests/tcg/hexagon/hvx_misc.c

   1 /*
   2  *  Copyright(c) 2021-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
   3  *
   4  *  This program is free software; you can redistribute it and/or modify
   5  *  it under the terms of the GNU General Public License as published by
   6  *  the Free Software Foundation; either version 2 of the License, or
   7  *  (at your option) any later version.
   8  *
   9  *  This program is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *  GNU General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU General Public License
  15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
  16  */
  17
  18 #include <stdio.h>
  19 #include <stdint.h>
  20 #include <stdbool.h>
  21 #include <string.h>
  22 #include <limits.h>
  23
/*
 * Global failure indicator: main() prints "FAIL" and returns non-zero when
 * this is non-zero.  It is defined ahead of the hvx_misc.h include,
 * presumably because the check helpers in that header update it -- confirm
 * against hvx_misc.h before moving it.
 */
int err;
  25
  26 #include "hvx_misc.h"
  27
  28 static void test_load_tmp(void)
  29 {
  30     void *p0 = buffer0;
  31     void *p1 = buffer1;
  32     void *pout = output;
  33
  34     for (int i = 0; i < BUFSIZE; i++) {
  35         /*
  36          * Load into v12 as .tmp, then use it in the next packet
  37          * Should get the new value within the same packet and
  38          * the old value in the next packet
  39          */
  40         asm("v3 = vmem(%0 + #0)\n\t"
  41             "r1 = #1\n\t"
  42             "v12 = vsplat(r1)\n\t"
  43             "{\n\t"
  44             "    v12.tmp = vmem(%1 + #0)\n\t"
  45             "    v4.w = vadd(v12.w, v3.w)\n\t"
  46             "}\n\t"
  47             "v4.w = vadd(v4.w, v12.w)\n\t"
  48             "vmem(%2 + #0) = v4\n\t"
  49             : : "r"(p0), "r"(p1), "r"(pout)
  50             : "r1", "v12", "v3", "v4", "v6", "memory");
  51         p0 += sizeof(MMVector);
  52         p1 += sizeof(MMVector);
  53         pout += sizeof(MMVector);
  54
  55         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
  56             expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
  57         }
  58     }
  59
  60     check_output_w(__LINE__, BUFSIZE);
  61 }
  62
  63 static void test_load_cur(void)
  64 {
  65     void *p0 = buffer0;
  66     void *pout = output;
  67
  68     for (int i = 0; i < BUFSIZE; i++) {
  69         asm("{\n\t"
  70             "    v2.cur = vmem(%0 + #0)\n\t"
  71             "    vmem(%1 + #0) = v2\n\t"
  72             "}\n\t"
  73             : : "r"(p0), "r"(pout) : "v2", "memory");
  74         p0 += sizeof(MMVector);
  75         pout += sizeof(MMVector);
  76
  77         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
  78             expect[i].uw[j] = buffer0[i].uw[j];
  79         }
  80     }
  81
  82     check_output_w(__LINE__, BUFSIZE);
  83 }
  84
  85 static void test_load_aligned(void)
  86 {
  87     /* Aligned loads ignore the low bits of the address */
  88     void *p0 = buffer0;
  89     void *pout = output;
  90     const size_t offset = 13;
  91
  92     p0 += offset;    /* Create an unaligned address */
  93     asm("v2 = vmem(%0 + #0)\n\t"
  94         "vmem(%1 + #0) = v2\n\t"
  95         : : "r"(p0), "r"(pout) : "v2", "memory");
  96
  97     expect[0] = buffer0[0];
  98
  99     check_output_w(__LINE__, 1);
 100 }
 101
 102 static void test_load_unaligned(void)
 103 {
 104     void *p0 = buffer0;
 105     void *pout = output;
 106     const size_t offset = 12;
 107
 108     p0 += offset;    /* Create an unaligned address */
 109     asm("v2 = vmemu(%0 + #0)\n\t"
 110         "vmem(%1 + #0) = v2\n\t"
 111         : : "r"(p0), "r"(pout) : "v2", "memory");
 112
 113     memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
 114
 115     check_output_w(__LINE__, 1);
 116 }
 117
 118 static void test_store_aligned(void)
 119 {
 120     /* Aligned stores ignore the low bits of the address */
 121     void *p0 = buffer0;
 122     void *pout = output;
 123     const size_t offset = 13;
 124
 125     pout += offset;    /* Create an unaligned address */
 126     asm("v2 = vmem(%0 + #0)\n\t"
 127         "vmem(%1 + #0) = v2\n\t"
 128         : : "r"(p0), "r"(pout) : "v2", "memory");
 129
 130     expect[0] = buffer0[0];
 131
 132     check_output_w(__LINE__, 1);
 133 }
 134
 135 static void test_store_unaligned(void)
 136 {
 137     void *p0 = buffer0;
 138     void *pout = output;
 139     const size_t offset = 12;
 140
 141     pout += offset;    /* Create an unaligned address */
 142     asm("v2 = vmem(%0 + #0)\n\t"
 143         "vmemu(%1 + #0) = v2\n\t"
 144         : : "r"(p0), "r"(pout) : "v2", "memory");
 145
 146     memcpy(expect, buffer0, 2 * sizeof(MMVector));
 147     memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
 148
 149     check_output_w(__LINE__, 2);
 150 }
 151
 152 static void test_masked_store(bool invert)
 153 {
 154     void *p0 = buffer0;
 155     void *pmask = mask;
 156     void *pout = output;
 157
 158     memset(expect, 0xff, sizeof(expect));
 159     memset(output, 0xff, sizeof(expect));
 160
 161     for (int i = 0; i < BUFSIZE; i++) {
 162         if (invert) {
 163             asm("r4 = #0\n\t"
 164                 "v4 = vsplat(r4)\n\t"
 165                 "v5 = vmem(%0 + #0)\n\t"
 166                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
 167                 "v5 = vmem(%1)\n\t"
 168                 "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
 169                 : : "r"(pmask), "r"(p0), "r"(pout)
 170                 : "r4", "v4", "v5", "q0", "memory");
 171         } else {
 172             asm("r4 = #0\n\t"
 173                 "v4 = vsplat(r4)\n\t"
 174                 "v5 = vmem(%0 + #0)\n\t"
 175                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
 176                 "v5 = vmem(%1)\n\t"
 177                 "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
 178                 : : "r"(pmask), "r"(p0), "r"(pout)
 179                 : "r4", "v4", "v5", "q0", "memory");
 180         }
 181         p0 += sizeof(MMVector);
 182         pmask += sizeof(MMVector);
 183         pout += sizeof(MMVector);
 184
 185         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
 186             if (invert) {
 187                 if (i + j % MASKMOD != 0) {
 188                     expect[i].w[j] = buffer0[i].w[j];
 189                 }
 190             } else {
 191                 if (i + j % MASKMOD == 0) {
 192                     expect[i].w[j] = buffer0[i].w[j];
 193                 }
 194             }
 195         }
 196     }
 197
 198     check_output_w(__LINE__, BUFSIZE);
 199 }
 200
 201 static void test_new_value_store(void)
 202 {
 203     void *p0 = buffer0;
 204     void *pout = output;
 205
 206     asm("{\n\t"
 207         "    v2 = vmem(%0 + #0)\n\t"
 208         "    vmem(%1 + #0) = v2.new\n\t"
 209         "}\n\t"
 210         : : "r"(p0), "r"(pout) : "v2", "memory");
 211
 212     expect[0] = buffer0[0];
 213
 214     check_output_w(__LINE__, 1);
 215 }
 216
 217 static void test_max_temps()
 218 {
 219     void *p0 = buffer0;
 220     void *pout = output;
 221
 222     asm("v0 = vmem(%0 + #0)\n\t"
 223         "v1 = vmem(%0 + #1)\n\t"
 224         "v2 = vmem(%0 + #2)\n\t"
 225         "v3 = vmem(%0 + #3)\n\t"
 226         "v4 = vmem(%0 + #4)\n\t"
 227         "{\n\t"
 228         "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
 229         "    v2.b = vshuffe(v3.b, v2.b)\n\t"
 230         "    v3.w = vadd(v1.w, v4.w)\n\t"
 231         "    v4.tmp = vmem(%0 + #5)\n\t"
 232         "}\n\t"
 233         "vmem(%1 + #0) = v0\n\t"
 234         "vmem(%1 + #1) = v1\n\t"
 235         "vmem(%1 + #2) = v2\n\t"
 236         "vmem(%1 + #3) = v3\n\t"
 237         "vmem(%1 + #4) = v4\n\t"
 238         : : "r"(p0), "r"(pout) : "memory");
 239
 240         /* The first two vectors come from the vadd-pair instruction */
 241         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
 242             expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
 243             expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
 244         }
 245         /* The third vector comes from the vshuffe instruction */
 246         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
 247             expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
 248                               (buffer0[3].uh[i] & 0xff) << 8;
 249         }
 250         /* The fourth vector comes from the vadd-single instruction */
 251         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
 252             expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
 253         }
 254         /*
 255          * The fifth vector comes from the load to v4
 256          * make sure the .tmp is dropped
 257          */
 258         expect[4] = buffer0[4];
 259
 260         check_output_b(__LINE__, 5);
 261 }
 262
/*
 * Instantiate the vector-operation tests through the TEST_VEC_OP2 and
 * TEST_VEC_OP1 macros (defined outside this file, presumably in hvx_misc.h).
 * Judging by the calls in main(), each invocation yields a function named
 * test_<first argument>.  The remaining arguments appear to be the HVX
 * mnemonic, the element-type suffix, the MMVector field, the element size in
 * bytes and the equivalent C operator -- confirm against the macro
 * definitions.
 */
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)
 273
/*
 * Instantiate the predicate-operation tests through TEST_PRED_OP2 (defined
 * outside this file).  main() calls the results as test_<first argument>()
 * with a bool argument.  The trailing "" / "!" argument presumably selects
 * the negated form of the instruction -- confirm against the macro
 * definition.
 */
TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")
 279
 280 static void test_vadduwsat(void)
 281 {
 282     /*
 283      * Test for saturation by adding two numbers that add to more than UINT_MAX
 284      * and make sure the result saturates to UINT_MAX
 285      */
 286     const uint32_t x = 0xffff0000;
 287     const uint32_t y = 0x000fffff;
 288
 289     memset(expect, 0x12, sizeof(MMVector));
 290     memset(output, 0x34, sizeof(MMVector));
 291
 292     asm volatile ("v10 = vsplat(%0)\n\t"
 293                   "v11 = vsplat(%1)\n\t"
 294                   "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
 295                   "vmem(%2+#0) = v21\n\t"
 296                   : /* no outputs */
 297                   : "r"(x), "r"(y), "r"(output)
 298                   : "v10", "v11", "v21", "memory");
 299
 300     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
 301         expect[0].uw[j] = UINT_MAX;
 302     }
 303
 304     check_output_w(__LINE__, 1);
 305 }
 306
 307 static void test_vsubuwsat_dv(void)
 308 {
 309     /*
 310      * Test for saturation by subtracting two numbers where the result is
 311      * negative and make sure the result saturates to zero
 312      *
 313      * vsubuwsat_dv operates on an HVX register pair, so we'll have a
 314      * pair of subtractions
 315      *     w - x < 0
 316      *     y - z < 0
 317      */
 318     const uint32_t w = 0x000000b7;
 319     const uint32_t x = 0xffffff4e;
 320     const uint32_t y = 0x31fe88e7;
 321     const uint32_t z = 0x7fffff79;
 322
 323     memset(expect, 0x12, sizeof(MMVector) * 2);
 324     memset(output, 0x34, sizeof(MMVector) * 2);
 325
 326     asm volatile ("v16 = vsplat(%0)\n\t"
 327                   "v17 = vsplat(%1)\n\t"
 328                   "v26 = vsplat(%2)\n\t"
 329                   "v27 = vsplat(%3)\n\t"
 330                   "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
 331                   "vmem(%4+#0) = v24\n\t"
 332                   "vmem(%4+#1) = v25\n\t"
 333                   : /* no outputs */
 334                   : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
 335                   : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
 336
 337     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
 338         expect[0].uw[j] = 0x00000000;
 339         expect[1].uw[j] = 0x00000000;
 340     }
 341
 342     check_output_w(__LINE__, 2);
 343 }
 344
 345 static void test_vshuff(void)
 346 {
 347     /* Test that vshuff works when the two operands are the same register */
 348     const uint32_t splat = 0x089be55c;
 349     const uint32_t shuff = 0x454fa926;
 350     MMVector v0, v1;
 351
 352     memset(expect, 0x12, sizeof(MMVector));
 353     memset(output, 0x34, sizeof(MMVector));
 354
 355     asm volatile("v25 = vsplat(%0)\n\t"
 356                  "vshuff(v25, v25, %1)\n\t"
 357                  "vmem(%2 + #0) = v25\n\t"
 358                  : /* no outputs */
 359                  : "r"(splat), "r"(shuff), "r"(output)
 360                  : "v25", "memory");
 361
 362     /*
 363      * The semantics of Hexagon are the operands are pass-by-value, so create
 364      * two copies of the vsplat result.
 365      */
 366     for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
 367         v0.uw[i] = splat;
 368         v1.uw[i] = splat;
 369     }
 370     /* Do the vshuff operation */
 371     for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) {
 372         if (shuff & offset) {
 373             for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) {
 374                 if (!(k & offset)) {
 375                     uint8_t tmp = v0.ub[k];
 376                     v0.ub[k] = v1.ub[k + offset];
 377                     v1.ub[k + offset] = tmp;
 378                 }
 379             }
 380         }
 381     }
 382     /* Put the result in the expect buffer for verification */
 383     expect[0] = v1;
 384
 385     check_output_b(__LINE__, 1);
 386 }
 387
 388 static void test_load_tmp_predicated(void)
 389 {
 390     void *p0 = buffer0;
 391     void *p1 = buffer1;
 392     void *pout = output;
 393     bool pred = true;
 394
 395     for (int i = 0; i < BUFSIZE; i++) {
 396         /*
 397          * Load into v12 as .tmp with a predicate
 398          * When the predicate is true, we get the vector from buffer1[i]
 399          * When the predicate is false, we get a vector of all 1's
 400          * Regardless of the predicate, the next packet should have
 401          * a vector of all 1's
 402          */
 403         asm("v3 = vmem(%0 + #0)\n\t"
 404             "r1 = #1\n\t"
 405             "v12 = vsplat(r1)\n\t"
 406             "p1 = !cmp.eq(%3, #0)\n\t"
 407             "{\n\t"
 408             "    if (p1) v12.tmp = vmem(%1 + #0)\n\t"
 409             "    v4.w = vadd(v12.w, v3.w)\n\t"
 410             "}\n\t"
 411             "v4.w = vadd(v4.w, v12.w)\n\t"
 412             "vmem(%2 + #0) = v4\n\t"
 413             : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
 414             : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
 415         p0 += sizeof(MMVector);
 416         p1 += sizeof(MMVector);
 417         pout += sizeof(MMVector);
 418
 419         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
 420             expect[i].w[j] =
 421                 pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
 422                      : buffer0[i].w[j] + 2;
 423         }
 424         pred = !pred;
 425     }
 426
 427     check_output_w(__LINE__, BUFSIZE);
 428 }
 429
 430 static void test_load_cur_predicated(void)
 431 {
 432     bool pred = true;
 433     for (int i = 0; i < BUFSIZE; i++) {
 434         asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
 435                      "v3 = vmem(%0+#0)\n\t"
 436                      /*
 437                       * Preload v4 to make sure that the assignment from the
 438                       * packet below is not being ignored when pred is false.
 439                       */
 440                      "r0 = #0x01237654\n\t"
 441                      "v4 = vsplat(r0)\n\t"
 442                      "{\n\t"
 443                      "    if (p0) v3.cur = vmem(%1+#0)\n\t"
 444                      "    v4 = v3\n\t"
 445                      "}\n\t"
 446                      "vmem(%2+#0) = v4\n\t"
 447                      :
 448                      : "r"(&buffer0[i]), "r"(&buffer1[i]),
 449                        "r"(&output[i]), "r"(pred)
 450                      : "r0", "p0", "v3", "v4", "memory");
 451         expect[i] = pred ? buffer1[i] : buffer0[i];
 452         pred = !pred;
 453     }
 454     check_output_w(__LINE__, BUFSIZE);
 455 }
 456
 457 int main()
 458 {
 459     init_buffers();
 460
 461     test_load_tmp();
 462     test_load_cur();
 463     test_load_aligned();
 464     test_load_unaligned();
 465     test_store_aligned();
 466     test_store_unaligned();
 467     test_masked_store(false);
 468     test_masked_store(true);
 469     test_new_value_store();
 470     test_max_temps();
 471
 472     test_vadd_w();
 473     test_vadd_h();
 474     test_vadd_b();
 475     test_vsub_w();
 476     test_vsub_h();
 477     test_vsub_b();
 478     test_vxor();
 479     test_vand();
 480     test_vor();
 481     test_vnot();
 482
 483     test_pred_or(false);
 484     test_pred_or_n(true);
 485     test_pred_and(false);
 486     test_pred_and_n(true);
 487     test_pred_xor(false);
 488
 489     test_vadduwsat();
 490     test_vsubuwsat_dv();
 491
 492     test_vshuff();
 493
 494     test_load_tmp_predicated();
 495     test_load_cur_predicated();
 496
 497     puts(err ? "FAIL" : "PASS");
 498     return err ? 1 : 0;
 499 }