From 9ae83078fe45d093bbaa02b8348f2407fe0c62d6 Mon Sep 17 00:00:00 2001 From: Robin Dapp Date: Mon, 15 Jan 2024 17:34:58 +0100 Subject: [PATCH] RISC-V: Adjust vec unit-stride load/store costs. Scalar loads provide offset addressing while unit-stride vector instructions do not. The offset must be loaded into a general-purpose register before it can be used. In order to account for this, this patch adds an address arithmetic heuristic that keeps track of data reference operands. If we haven't seen the operand before we add the cost of a scalar statement. This helps to get rid of an lbm regression when vectorizing (roughly 0.5% fewer dynamic instructions). gcc5 improves by 0.2% and deepsjeng by 0.25%. wrf and nab degrade by 0.1%. This is because we now adjust the cost of SLP as well as loop-vectorized instructions whereas we would only adjust loop-vectorized instructions before. Considering higher scalar_to_vec costs (3 vs 1) for all vectorization types causes some snippets not to get vectorized anymore. Given these costs the decision looks correct but appears worse when just counting dynamic instructions. In total SPECint 2017 has 4 billion fewer dynamic instructions and SPECfp 0.7 billion fewer. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Move... (costs::adjust_stmt_cost): ... to here and add vector_load/vector_store offset handling. (costs::add_stmt_cost): Also adjust cost for statements without stmt_info. * config/riscv/riscv-vector-costs.h: Define zero constant. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc | 86 +++++++++++++++++++--- gcc/config/riscv/riscv-vector-costs.h | 10 +++ .../gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c | 51 +++++++++++++ .../gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c | 51 +++++++++++++ 4 files changed, 188 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 7c9840df4e9..adf9c197df5 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. If not see #include "backend.h" #include "tree-data-ref.h" #include "tree-ssa-loop-niter.h" +#include "tree-hash-traits.h" /* This file should be included last. */ #include "riscv-vector-costs.h" @@ -1047,18 +1048,81 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const top of riscv_builtin_vectorization_cost handling which doesn't have any information on statement operation codes etc. */ -static unsigned -adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) +unsigned +costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, + stmt_vec_info stmt_info, + slp_tree, tree vectype, int stmt_cost) { const cpu_vector_cost *costs = get_vector_costs (); switch (kind) { case scalar_to_vec: - return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR - : costs->regmove->GR2VR); + stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR + : costs->regmove->GR2VR); + break; case vec_to_scalar: - return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR - : costs->regmove->VR2GR); + stmt_cost += (FLOAT_TYPE_P (vectype) ? 
costs->regmove->VR2FR + : costs->regmove->VR2GR); + break; + case vector_load: + case vector_store: + { + /* Unit-stride vector loads and stores do not have offset addressing + as opposed to scalar loads and stores. + If the address depends on a variable we need an additional + add/sub for each load/store in the worst case. */ + if (stmt_info && stmt_info->stmt) + { + data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); + class loop *father = stmt_info->stmt->bb->loop_father; + if (!loop && father && !father->inner && father->superloops) + { + tree ref; + if (TREE_CODE (dr->ref) != MEM_REF + || !(ref = TREE_OPERAND (dr->ref, 0)) + || TREE_CODE (ref) != SSA_NAME) + break; + + if (SSA_NAME_IS_DEFAULT_DEF (ref)) + break; + + if (memrefs.contains ({ref, cst0})) + break; + + memrefs.add ({ref, cst0}); + + /* In case we have not seen REF before and the base address + is a pointer operation try a bit harder. */ + tree base = DR_BASE_ADDRESS (dr); + if (TREE_CODE (base) == POINTER_PLUS_EXPR + || TREE_CODE (base) == POINTER_DIFF_EXPR) + { + /* Deconstruct BASE's first operand. If it is a binary + operation, i.e. a base and an "offset" store this + pair. Only increase the stmt_cost if we haven't seen + it before. 
*/ + tree argp = TREE_OPERAND (base, 1); + typedef std::pair<tree, tree> addr_pair; + addr_pair pair; + if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary) + { + tree argp0 = tree_strip_nop_conversions + (TREE_OPERAND (argp, 0)); + tree argp1 = TREE_OPERAND (argp, 1); + pair = addr_pair (argp0, argp1); + if (memrefs.contains (pair)) + break; + + memrefs.add (pair); + stmt_cost += builtin_vectorization_cost (scalar_stmt, + NULL_TREE, 0); + } + } + } + } + break; + } + default: break; } @@ -1067,7 +1131,7 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) unsigned costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, slp_tree, tree vectype, + stmt_vec_info stmt_info, slp_tree node, tree vectype, int misalign, vect_cost_model_location where) { int stmt_cost @@ -1080,6 +1144,7 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, if (loop_vinfo) analyze_loop_vinfo (loop_vinfo); + memrefs.empty (); m_analyzed_vinfo = true; } @@ -1092,11 +1157,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, as one iteration of the VLA loop. */ if (where == vect_body && m_unrolled_vls_niters) m_unrolled_vls_stmts += count * m_unrolled_vls_niters; - - if (vectype) - stmt_cost = adjust_stmt_cost (kind, vectype, stmt_cost); } + if (vectype) + stmt_cost = adjust_stmt_cost (kind, loop_vinfo, stmt_info, node, vectype, + stmt_cost); + return record_stmt_cost (stmt_info, where, count * stmt_cost); } diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index 4e2bbfd5ca9..ca0ef1199b2 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -85,6 +85,12 @@ private: unsigned HOST_WIDE_INT m_unrolled_vls_niters = 0; unsigned HOST_WIDE_INT m_unrolled_vls_stmts = 0; + tree cst0 = build_int_cst (integer_type_node, 0); + + /* Store the memory references already processed. 
*/ + typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; + hash_set <tree_pair_hash> memrefs; + void analyze_loop_vinfo (loop_vec_info); void record_potential_vls_unrolling (loop_vec_info); bool prefer_unrolled_loop () const; @@ -98,6 +104,10 @@ private: void record_potential_unexpected_spills (loop_vec_info); void adjust_vect_cost_per_loop (loop_vec_info); + unsigned adjust_stmt_cost (enum vect_cost_for_stmt kind, + loop_vec_info, + stmt_vec_info stmt_info, slp_tree, + tree vectype, int stmt_cost); }; } // namespace riscv_vector diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c new file mode 100644 index 00000000000..530146a6d31 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fdump-tree-slp1-details" } */ + +#define f1 (1.0 / 3.0) +#define f2 (1.0 / 18.0) +#define f3 (1.0 / 36.0) + +#define SIZE_X 10 +#define SIZE_Y 10 +#define SIZE_Z 10 + +typedef enum {C = 0, + N, S, E, W, T, B, + NE, NW, SE, SW, + NT, NB, ST, SB, + ET, EB, WT, WB, + FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; + +#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \ + (y)*SIZE_X+(z)*SIZE_X*SIZE_Y)) +#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)]) +#define LOCAL(g,e) (GRID_ENTRY_SWEEP (g, 0, 0, 0, e)) + +void foo (double *grid) +{ + for( int i = CALC_INDEX(0, 0, -2, 0); \ + i < CALC_INDEX(0, 0, SIZE_Z + 2, 0); \ + i += N_CELL_ENTRIES ) { + LOCAL (grid, C ) = f1; + LOCAL (grid, N ) = f2; + LOCAL (grid, S ) = f2; + LOCAL (grid, E ) = f2; + LOCAL (grid, W ) = f2; + LOCAL (grid, T ) = f2; + LOCAL (grid, B ) = f2; + LOCAL (grid, NE) = f3; + LOCAL (grid, NW) = f3; + LOCAL (grid, SE) = f3; + LOCAL (grid, SW) = f3; + LOCAL (grid, NT) = f3; + LOCAL (grid, NB) = f3; + LOCAL (grid, ST) = f3; + LOCAL (grid, SB) = f3; + LOCAL (grid, ET) = f3; + LOCAL (grid, EB) = f3; + LOCAL (grid, WT) = f3; + LOCAL (grid, WB) = f3; + } 
+} + +/* { dg-final { scan-tree-dump-times "vectorized using SLP" 0 "slp1" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c new file mode 100644 index 00000000000..7650a0e40fc --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vse-slp-2.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fdump-tree-slp1-details" } */ + +#define f1 3 +#define f2 4 +#define f3 5 + +#define SIZE_X 10 +#define SIZE_Y 10 +#define SIZE_Z 10 + +typedef enum {C = 0, + N, S, E, W, T, B, + NE, NW, SE, SW, + NT, NB, ST, SB, + ET, EB, WT, WB, + FLAGS, N_CELL_ENTRIES} CELL_ENTRIES; + +#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \ + (y)*SIZE_X+(z)*SIZE_X*SIZE_Y)) +#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)]) +#define LOCAL(g,e) (GRID_ENTRY_SWEEP (g, 0, 0, 0, e)) + +void foo (unsigned long *grid) +{ + for( int i = CALC_INDEX(0, 0, -2, 0); \ + i < CALC_INDEX(0, 0, SIZE_Z + 2, 0); \ + i += N_CELL_ENTRIES ) { + LOCAL (grid, C ) = f1; + LOCAL (grid, N ) = f2; + LOCAL (grid, S ) = f2; + LOCAL (grid, E ) = f2; + LOCAL (grid, W ) = f2; + LOCAL (grid, T ) = f2; + LOCAL (grid, B ) = f2; + LOCAL (grid, NE) = f3; + LOCAL (grid, NW) = f3; + LOCAL (grid, SE) = f3; + LOCAL (grid, SW) = f3; + LOCAL (grid, NT) = f3; + LOCAL (grid, NB) = f3; + LOCAL (grid, ST) = f3; + LOCAL (grid, SB) = f3; + LOCAL (grid, ET) = f3; + LOCAL (grid, EB) = f3; + LOCAL (grid, WT) = f3; + LOCAL (grid, WB) = f3; + } +} + +/* { dg-final { scan-tree-dump-times "vectorized using SLP" 0 "slp1" } } */ -- 2.11.4.GIT