From 66368bf4bc95af685d7ce30ef5833f4e6079c81b Mon Sep 17 00:00:00 2001 From: davidxl Date: Wed, 28 Jul 2010 19:13:11 +0000 Subject: [PATCH] IVOPT performance tuning patch. The main problem is a variant of maximal weight bipartite matching/assignment problem -- i.e., there is an additional global cost function. The complexity of the algorighm to find the optimial solution > O(n^2). The existing algorithm in gcc tries to find the solution in 3 stages: 1) Find the initial solution set (dynamic programing style) 2) Extend the solution set 3) Prune the soultion set. The problem is that in step 1, the initial set tends to be too large so that the final solution is very likely local optimal. This patch addresses the problem and sees very large SPEC improvements. Another area of problem is that ivopts often creates loop invariant expressions, and such expressions increase register pressure which is not counted. This is addressed in this patch. The third main problem is the profile data is not considered in cost computation The forth problem is that loop invariant comptuation's cost is not properly adjusted. There are more tuning opportuties, namely: 1) Do not check ivs dependency during ivs set pruning (this improves deallII 8% on core2) 2) Unconditionally consider all important candidates in partial set expansion (in addition to the extended solutino based on selected candidates) 3) revisit the two stage initial set computation. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@162653 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 34 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c | 18 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_2.c | 17 + gcc/testsuite/gcc.dg/tree-ssa/ivopt_3.c | 20 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_4.c | 19 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_infer_1.c | 25 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_infer_2.c | 25 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_1.c | 24 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_2.c | 25 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_3.c | 22 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_4.c | 25 ++ gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c | 2 +- gcc/tree-ssa-loop-ivopts.c | 430 ++++++++++++++++++++++---- 13 files changed, 628 insertions(+), 58 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_2.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_3.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_4.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_infer_1.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_infer_2.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_1.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_2.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_3.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_4.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d5f931dca59..10059b05a66 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,37 @@ +2010-07-28 Xinliang David Li + + * tree-ssa-loop-ivopts.c (avg_loop_niter): New function. + (dump_cand): Dump var_before/after. + (htab_inv_expr_eq): New function. + (htab_inv_expr_hash): New function. + (tree_ssa_iv_optimize_init): Support pseudo invariants. + (add_candidate_1): consider base type precision. + (set_use_iv_cost): New parameter. + (adjust_setup_cost): Use profile information. + (get_address_cost): Do not hard code width in computing address + offset limits. + (compare_aff_trees): New function. + (get_loop_invariant_expr_id): New function. + (get_computation_cost_at): New parameter and use profile information. + (get_computation_cost): New parameter. + (determine_use_iv_cost_generic): Pass new parameter. + (determine_use_iv_cost_address): Ditto. + (determine_use_iv_cost_condition): Ditto. + (autoinc_possible_for_pair): Ditto. + (determine_use_iv_costs): More dumps. + (iv_ca_get_num_inv_exprs): New function. + (iv_ca_recount_cost): Consider loop invariants in register pressure + cost. + (iv_ca_add_use): New parameter. + (iv_ca_dump): Better dumping. + (iv_ca_extend): New parameter. + (try_add_cand_for): Attempt to get better partial solution. + (try_improve_iv_set): Pass new parameter to iv_ca_extend. + (create_new-ivs): More dumps. + (rewrite_use_compare): Ditto. + (free_loop_data): More cleanup. + (treee_ssa_iv_optimize_finalize): Ditto. + 2010-07-28 Kai Tietz * config/i386/i386.h (MCOUNT_NAME_BEFORE_PROLOGUE): New. diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c new file mode 100644 index 00000000000..60baa4bd336 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_1.c @@ -0,0 +1,18 @@ +/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */ +/* { dg-options "-O2 -m64 -fdump-tree-ivopts" } */ +#define TYPE char* + +/* Testing that only one induction variable is selected after IVOPT on + the given target instead of 3. */ +void foo (int i_width, TYPE dst, TYPE src1, TYPE src2) +{ + int x; + for( x = 0; x < i_width; x++ ) + { + dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; + } +} + + +/* { dg-final { scan-tree-dump-times "PHI > 1; + } +} + +/* { dg-final { scan-tree-dump-times "PHI > 1; + dst+=sizeof(TYPE); + src1+=sizeof(TYPE); + src2+=sizeof(TYPE); + } +} + +/* { dg-final { scan-tree-dump-times "PHI > 1; + } +} + +/* { dg-final { scan-tree-dump-times "PHI > 1; + dst++; + i += 16; + } +} + +/* { dg-final { scan-tree-dump-times "Replacing" 1 "ivopts"} } */ +/* { dg-final { cleanup-tree-dump "ivopts" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopt_infer_2.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_infer_2.c new file mode 100644 index 00000000000..4b7e197dd04 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_infer_2.c @@ -0,0 +1,25 @@ +/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */ +/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */ + +#ifndef TYPE +#define TYPE char* +#endif + +extern int a[]; + +/* Can not infer loop iteration from array -- exit test can not be replaced. */ +void foo (int i_width, TYPE dst, TYPE src1, TYPE src2) +{ + TYPE dstn= dst + i_width; + TYPE dst0 = dst; + unsigned long long i = 0; + for( ; dst <= dstn; ) + { + dst0[i] = ( src1[i] + src2[i] + 1 +a[i]) >> 1; + dst++; + i += 16; + } +} + +/* { dg-final { scan-tree-dump-times "Replacing" 0 "ivopts"} } */ +/* { dg-final { cleanup-tree-dump "ivopts" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_1.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_1.c new file mode 100644 index 00000000000..4e19dfd01e5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_1.c @@ -0,0 +1,24 @@ +/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */ +/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */ + +/* The test 'if (p2 > p_limit2)' can be replaced, so iv p2 can be + * eliminated. */ +long foo(long* p, long* p2, int N1, int N2) +{ + int i = 0; + long* p_limit = p + N1; + long* p_limit2 = p2 + N2; + long s = 0; + while (p <= p_limit) + { + p++; + p2++; + if (p2 > p_limit2) + break; + s += (*p); + } + return s; +} + +/* { dg-final { scan-tree-dump-times "Replacing" 1 "ivopts"} } */ +/* { dg-final { cleanup-tree-dump "ivopts" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_2.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_2.c new file mode 100644 index 00000000000..5e38df67be4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_2.c @@ -0,0 +1,25 @@ +/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */ +/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */ + +/* Exit tests 'i < N1' and 'p2 > p_limit2' can be replaced, so + * two ivs i and p2 can be eliminate. */ +long foo(long* p, long* p2, int N1, int N2) +{ + int i = 0; + long* p_limit2 = p2 + N2; + long s = 0; + while (i < N1) + { + p++; + p2++; + i++; + if (p2 > p_limit2) + break; + s += (*p); + } + + return s; +} + +/* { dg-final { scan-tree-dump-times "Replacing" 2 "ivopts"} } */ +/* { dg-final { cleanup-tree-dump "ivopts" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_3.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_3.c new file mode 100644 index 00000000000..dc78a43f73f --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_3.c @@ -0,0 +1,22 @@ +/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */ +/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */ + +/* iv p2 can be eliminated. */ +long foo(long* p, long* p2, int N1, int N2) +{ + unsigned long i = 0; + long* p_limit2 = p2 + N2; + long s = 0; + while (i < N1) + { + p2++; + i++; + if (p2 > p_limit2) + break; + s += p[i]; + } + return s; +} + +/* { dg-final { scan-tree-dump-times "Replacing" 1 "ivopts"} } */ +/* { dg-final { cleanup-tree-dump "ivopts" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_4.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_4.c new file mode 100644 index 00000000000..d2aa78d61e3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopt_mult_4.c @@ -0,0 +1,25 @@ + +/* { dg-do compile { target {{ i?86-*-* x86_64-*-* } && lp64 } } } */ +/* { dg-options "-O2 -m64 -fdump-tree-ivopts-details" } */ + +/* iv i's step 16 so its period is smaller than the max iterations + * i.e. replacing if (p2 > p_limit2) with testing of i may result in + * overflow. */ +long foo(long* p, long* p2, int N1, int N2) +{ + unsigned long i = 0; + long* p_limit2 = p2 + N2; + long s = 0; + while (i < N1) + { + p2++; + i += 16; + if (p2 > p_limit2) + break; + s += p[i]; + } + return s; +} + +/* { dg-final { scan-tree-dump-times "Replacing" 0 "ivopts"} } */ +/* { dg-final { cleanup-tree-dump "ivopts" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c index 7c4236b1cc7..202ad1f4e2f 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-3.c @@ -8,5 +8,5 @@ void main (void) f2 (); } -/* { dg-final { scan-tree-dump-times "!= 0" 4 "ivopts" } } */ +/* { dg-final { scan-tree-dump-times "!= 0" 5 "ivopts" } } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index 478a0190250..519f66e51d1 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -89,6 +89,7 @@ along with GCC; see the file COPYING3. If not see #include "langhooks.h" #include "tree-affine.h" #include "target.h" +#include "tree-inline.h" #include "tree-ssa-propagate.h" /* FIXME: Expressions are expanded to RTL in this pass to determine the @@ -99,10 +100,21 @@ along with GCC; see the file COPYING3. If not see /* The infinite cost. */ #define INFTY 10000000 -/* The expected number of loop iterations. TODO -- use profiling instead of - this. */ #define AVG_LOOP_NITER(LOOP) 5 +/* Returns the expected number of loop iterations for LOOP. + The average trip count is computed from profile data if it + exists. */ + +static inline HOST_WIDE_INT +avg_loop_niter (struct loop *loop) +{ + HOST_WIDE_INT niter = estimated_loop_iterations_int (loop, false); + if (niter == -1) + return AVG_LOOP_NITER (loop); + + return niter; +} /* Representation of the induction variable. */ struct iv @@ -158,6 +170,7 @@ struct cost_pair tree value; /* For final value elimination, the expression for the final value of the iv. For iv elimination, the new bound to compare with. */ + int inv_expr_id; /* Loop invariant expression id. */ }; /* Use. */ @@ -212,6 +225,14 @@ struct iv_cand biv. */ }; +/* Loop invariant expression hashtable entry. */ +struct iv_inv_expr_ent +{ + tree expr; + int id; + hashval_t hash; +}; + /* The data used by the induction variable optimizations. */ typedef struct iv_use *iv_use_p; @@ -239,6 +260,13 @@ struct ivopts_data /* The array of information for the ssa names. */ struct version_info *version_info; + /* The hashtable of loop invariant expressions created + by ivopt. */ + htab_t inv_expr_tab; + + /* Loop invariant expression id. */ + int inv_expr_id; + /* The bitmap of indices in version_info whose value was changed. */ bitmap relevant; @@ -520,6 +548,19 @@ dump_cand (FILE *file, struct iv_cand *cand) return; } + if (cand->var_before) + { + fprintf (file, " var_before "); + print_generic_expr (file, cand->var_before, TDF_SLIM); + fprintf (file, "\n"); + } + if (cand->var_after) + { + fprintf (file, " var_after "); + print_generic_expr (file, cand->var_after, TDF_SLIM); + fprintf (file, "\n"); + } + switch (cand->pos) { case IP_NORMAL: @@ -776,6 +817,29 @@ niter_for_single_dom_exit (struct ivopts_data *data) return niter_for_exit (data, exit, NULL); } +/* Hash table equality function for expressions. */ + +static int +htab_inv_expr_eq (const void *ent1, const void *ent2) +{ + const struct iv_inv_expr_ent *expr1 = + (const struct iv_inv_expr_ent *)ent1; + const struct iv_inv_expr_ent *expr2 = + (const struct iv_inv_expr_ent *)ent2; + + return operand_equal_p (expr1->expr, expr2->expr, 0); +} + +/* Hash function for loop invariant expressions. */ + +static hashval_t +htab_inv_expr_hash (const void *ent) +{ + const struct iv_inv_expr_ent *expr = + (const struct iv_inv_expr_ent *)ent; + return expr->hash; +} + /* Initializes data structures used by the iv optimization pass, stored in DATA. */ @@ -790,6 +854,9 @@ tree_ssa_iv_optimize_init (struct ivopts_data *data) data->niters = NULL; data->iv_uses = VEC_alloc (iv_use_p, heap, 20); data->iv_candidates = VEC_alloc (iv_cand_p, heap, 20); + data->inv_expr_tab = htab_create (10, htab_inv_expr_hash, + htab_inv_expr_eq, free); + data->inv_expr_id = 0; decl_rtl_to_reset = VEC_alloc (tree, heap, 20); } @@ -1840,7 +1907,7 @@ find_interesting_uses_outside (struct ivopts_data *data, edge exit) phi = gsi_stmt (psi); def = PHI_ARG_DEF_FROM_EDGE (phi, exit); if (is_gimple_reg (def)) - find_interesting_uses_op (data, def); + find_interesting_uses_op (data, def); } } @@ -2157,7 +2224,9 @@ add_candidate_1 (struct ivopts_data *data, continue; if (operand_equal_p (base, cand->iv->base, 0) - && operand_equal_p (step, cand->iv->step, 0)) + && operand_equal_p (step, cand->iv->step, 0) + && (TYPE_PRECISION (TREE_TYPE (base)) + == TYPE_PRECISION (TREE_TYPE (cand->iv->base)))) break; } @@ -2571,7 +2640,8 @@ infinite_cost_p (comp_cost cost) static void set_use_iv_cost (struct ivopts_data *data, struct iv_use *use, struct iv_cand *cand, - comp_cost cost, bitmap depends_on, tree value) + comp_cost cost, bitmap depends_on, tree value, + int inv_expr_id) { unsigned i, s; @@ -2587,6 +2657,7 @@ set_use_iv_cost (struct ivopts_data *data, use->cost_map[cand->id].cost = cost; use->cost_map[cand->id].depends_on = depends_on; use->cost_map[cand->id].value = value; + use->cost_map[cand->id].inv_expr_id = inv_expr_id; return; } @@ -2606,6 +2677,7 @@ found: use->cost_map[i].cost = cost; use->cost_map[i].depends_on = depends_on; use->cost_map[i].value = value; + use->cost_map[i].inv_expr_id = inv_expr_id; } /* Gets cost of (USE, CANDIDATE) pair. */ @@ -2956,7 +3028,7 @@ adjust_setup_cost (struct ivopts_data *data, unsigned cost) if (cost == INFTY) return cost; else if (optimize_loop_for_speed_p (data->current_loop)) - return cost / AVG_LOOP_NITER (data->current_loop); + return cost / avg_loop_niter (data->current_loop); else return cost; } @@ -3171,7 +3243,7 @@ get_address_cost (bool symbol_present, bool var_present, HOST_WIDE_INT i; HOST_WIDE_INT start = BIGGEST_ALIGNMENT / BITS_PER_UNIT; HOST_WIDE_INT rat, off; - int old_cse_not_expected; + int old_cse_not_expected, width; unsigned sym_p, var_p, off_p, rat_p, add_c; rtx seq, addr, base; rtx reg0, reg1; @@ -3180,8 +3252,10 @@ get_address_cost (bool symbol_present, bool var_present, reg1 = gen_raw_REG (address_mode, LAST_VIRTUAL_REGISTER + 1); + width = (GET_MODE_BITSIZE (address_mode) < HOST_BITS_PER_WIDE_INT - 2) + ? GET_MODE_BITSIZE (address_mode) : HOST_BITS_PER_WIDE_INT - 2; addr = gen_rtx_fmt_ee (PLUS, address_mode, reg1, NULL_RTX); - for (i = start; i <= 1 << 20; i <<= 1) + for (i = start; i <= 1ll << width; i <<= 1) { XEXP (addr, 1) = gen_int_mode (i, address_mode); if (!memory_address_addr_space_p (mem_mode, addr, as)) @@ -3190,7 +3264,7 @@ get_address_cost (bool symbol_present, bool var_present, data->max_offset = i == start ? 0 : i >> 1; off = data->max_offset; - for (i = start; i <= 1 << 20; i <<= 1) + for (i = start; i <= 1ll << width; i <<= 1) { XEXP (addr, 1) = gen_int_mode (-i, address_mode); if (!memory_address_addr_space_p (mem_mode, addr, as)) @@ -3201,12 +3275,12 @@ get_address_cost (bool symbol_present, bool var_present, if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "get_address_cost:\n"); - fprintf (dump_file, " min offset %s %d\n", + fprintf (dump_file, " min offset %s " HOST_WIDE_INT_PRINT_DEC "\n", GET_MODE_NAME (mem_mode), - (int) data->min_offset); - fprintf (dump_file, " max offset %s %d\n", + data->min_offset); + fprintf (dump_file, " max offset %s " HOST_WIDE_INT_PRINT_DEC "\n", GET_MODE_NAME (mem_mode), - (int) data->max_offset); + data->max_offset); } rat = 1; @@ -3717,6 +3791,144 @@ difference_cost (struct ivopts_data *data, return force_var_cost (data, aff_combination_to_tree (&aff_e1), depends_on); } +/* Returns true if AFF1 and AFF2 are identical. */ + +static bool +compare_aff_trees (aff_tree *aff1, aff_tree *aff2) +{ + unsigned i; + + if (aff1->n != aff2->n) + return false; + + for (i = 0; i < aff1->n; i++) + { + if (double_int_cmp (aff1->elts[i].coef, aff2->elts[i].coef, 0) != 0) + return false; + + if (!operand_equal_p (aff1->elts[i].val, aff2->elts[i].val, 0)) + return false; + } + return true; +} + +/* Returns the pseudo expr id if expression UBASE - RATIO * CBASE + requires a new compiler generated temporary. Returns -1 otherwise. + ADDRESS_P is a flag indicating if the expression is for address + computation. */ + +static int +get_loop_invariant_expr_id (struct ivopts_data *data, tree ubase, + tree cbase, HOST_WIDE_INT ratio, + bool address_p) +{ + aff_tree ubase_aff, cbase_aff; + tree expr, ub, cb; + struct iv_inv_expr_ent ent; + struct iv_inv_expr_ent **slot; + + STRIP_NOPS (ubase); + STRIP_NOPS (cbase); + ub = ubase; + cb = cbase; + + if ((TREE_CODE (ubase) == INTEGER_CST) + && (TREE_CODE (cbase) == INTEGER_CST)) + return -1; + + /* Strips the constant part. */ + if (TREE_CODE (ubase) == PLUS_EXPR + || TREE_CODE (ubase) == MINUS_EXPR + || TREE_CODE (ubase) == POINTER_PLUS_EXPR) + { + if (TREE_CODE (TREE_OPERAND (ubase, 1)) == INTEGER_CST) + ubase = TREE_OPERAND (ubase, 0); + } + + /* Strips the constant part. */ + if (TREE_CODE (cbase) == PLUS_EXPR + || TREE_CODE (cbase) == MINUS_EXPR + || TREE_CODE (cbase) == POINTER_PLUS_EXPR) + { + if (TREE_CODE (TREE_OPERAND (cbase, 1)) == INTEGER_CST) + cbase = TREE_OPERAND (cbase, 0); + } + + if (address_p) + { + if (((TREE_CODE (ubase) == SSA_NAME) + || (TREE_CODE (ubase) == ADDR_EXPR + && is_gimple_min_invariant (ubase))) + && (TREE_CODE (cbase) == INTEGER_CST)) + return -1; + + if (((TREE_CODE (cbase) == SSA_NAME) + || (TREE_CODE (cbase) == ADDR_EXPR + && is_gimple_min_invariant (cbase))) + && (TREE_CODE (ubase) == INTEGER_CST)) + return -1; + } + + if (ratio == 1) + { + if(operand_equal_p (ubase, cbase, 0)) + return -1; + + if (TREE_CODE (ubase) == ADDR_EXPR + && TREE_CODE (cbase) == ADDR_EXPR) + { + tree usym, csym; + + usym = TREE_OPERAND (ubase, 0); + csym = TREE_OPERAND (cbase, 0); + if (TREE_CODE (usym) == ARRAY_REF) + { + tree ind = TREE_OPERAND (usym, 1); + if (TREE_CODE (ind) == INTEGER_CST + && host_integerp (ind, 0) + && TREE_INT_CST_LOW (ind) == 0) + usym = TREE_OPERAND (usym, 0); + } + if (TREE_CODE (csym) == ARRAY_REF) + { + tree ind = TREE_OPERAND (csym, 1); + if (TREE_CODE (ind) == INTEGER_CST + && host_integerp (ind, 0) + && TREE_INT_CST_LOW (ind) == 0) + csym = TREE_OPERAND (csym, 0); + } + if (operand_equal_p (usym, csym, 0)) + return -1; + } + /* Now do more complex comparison */ + tree_to_aff_combination (ubase, TREE_TYPE (ubase), &ubase_aff); + tree_to_aff_combination (cbase, TREE_TYPE (cbase), &cbase_aff); + if (compare_aff_trees (&ubase_aff, &cbase_aff)) + return -1; + } + + tree_to_aff_combination (ub, TREE_TYPE (ub), &ubase_aff); + tree_to_aff_combination (cb, TREE_TYPE (cb), &cbase_aff); + + aff_combination_scale (&cbase_aff, shwi_to_double_int (-1 * ratio)); + aff_combination_add (&ubase_aff, &cbase_aff); + expr = aff_combination_to_tree (&ubase_aff); + ent.expr = expr; + ent.hash = iterative_hash_expr (expr, 0); + slot = (struct iv_inv_expr_ent **) htab_find_slot (data->inv_expr_tab, + &ent, INSERT); + if (*slot) + return (*slot)->id; + + *slot = XNEW (struct iv_inv_expr_ent); + (*slot)->expr = expr; + (*slot)->hash = ent.hash; + (*slot)->id = data->inv_expr_id++; + return (*slot)->id; +} + + + /* Determines the cost of the computation by that USE is expressed from induction variable CAND. If ADDRESS_P is true, we just need to create an address from it, otherwise we want to get it into @@ -3729,7 +3941,8 @@ static comp_cost get_computation_cost_at (struct ivopts_data *data, struct iv_use *use, struct iv_cand *cand, bool address_p, bitmap *depends_on, gimple at, - bool *can_autoinc) + bool *can_autoinc, + int *inv_expr_id) { tree ubase = use->iv->base, ustep = use->iv->step; tree cbase, cstep; @@ -3812,6 +4025,7 @@ get_computation_cost_at (struct ivopts_data *data, ubase, build_int_cst (utype, 0), &symbol_present, &var_present, &offset, depends_on); + cost.cost /= avg_loop_niter (data->current_loop); } else if (ratio == 1) { @@ -3819,6 +4033,7 @@ get_computation_cost_at (struct ivopts_data *data, ubase, cbase, &symbol_present, &var_present, &offset, depends_on); + cost.cost /= avg_loop_niter (data->current_loop); } else if (address_p && !POINTER_TYPE_P (ctype) @@ -3832,16 +4047,27 @@ get_computation_cost_at (struct ivopts_data *data, ubase, cbase, &symbol_present, &var_present, &offset, depends_on); + cost.cost /= avg_loop_niter (data->current_loop); } else { cost = force_var_cost (data, cbase, depends_on); - cost.cost += add_cost (TYPE_MODE (ctype), data->speed); cost = add_costs (cost, difference_cost (data, ubase, build_int_cst (utype, 0), &symbol_present, &var_present, &offset, depends_on)); + cost.cost /= avg_loop_niter (data->current_loop); + cost.cost += add_cost (TYPE_MODE (ctype), data->speed); + } + + if (inv_expr_id) + { + *inv_expr_id = + get_loop_invariant_expr_id (data, ubase, cbase, ratio, address_p); + /* Clear depends on. */ + if (*inv_expr_id != -1 && depends_on && *depends_on) + bitmap_clear (*depends_on); } /* If we are after the increment, the value of the candidate is higher by @@ -3916,11 +4142,12 @@ fallback: static comp_cost get_computation_cost (struct ivopts_data *data, struct iv_use *use, struct iv_cand *cand, - bool address_p, bitmap *depends_on, bool *can_autoinc) + bool address_p, bitmap *depends_on, + bool *can_autoinc, int *inv_expr_id) { return get_computation_cost_at (data, use, cand, address_p, depends_on, use->stmt, - can_autoinc); + can_autoinc, inv_expr_id); } /* Determines cost of basing replacement of USE on CAND in a generic @@ -3932,6 +4159,7 @@ determine_use_iv_cost_generic (struct ivopts_data *data, { bitmap depends_on; comp_cost cost; + int inv_expr_id = -1; /* The simple case first -- if we need to express value of the preserved original biv, the cost is 0. This also prevents us from counting the @@ -3940,12 +4168,15 @@ determine_use_iv_cost_generic (struct ivopts_data *data, if (cand->pos == IP_ORIGINAL && cand->incremented_at == use->stmt) { - set_use_iv_cost (data, use, cand, zero_cost, NULL, NULL_TREE); + set_use_iv_cost (data, use, cand, zero_cost, NULL, NULL_TREE, -1); return true; } - cost = get_computation_cost (data, use, cand, false, &depends_on, NULL); - set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE); + cost = get_computation_cost (data, use, cand, false, &depends_on, + NULL, &inv_expr_id); + + set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, + inv_expr_id); return !infinite_cost_p (cost); } @@ -3958,8 +4189,9 @@ determine_use_iv_cost_address (struct ivopts_data *data, { bitmap depends_on; bool can_autoinc; + int inv_expr_id = -1; comp_cost cost = get_computation_cost (data, use, cand, true, &depends_on, - &can_autoinc); + &can_autoinc, &inv_expr_id); if (cand->ainc_use == use) { @@ -3971,7 +4203,8 @@ determine_use_iv_cost_address (struct ivopts_data *data, else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE) cost = infinite_cost; } - set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE); + set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, + inv_expr_id); return !infinite_cost_p (cost); } @@ -4151,12 +4384,13 @@ determine_use_iv_cost_condition (struct ivopts_data *data, bitmap depends_on_elim = NULL, depends_on_express = NULL, depends_on; comp_cost elim_cost, express_cost, cost; bool ok; + int inv_expr_id = -1; tree *control_var, *bound_cst; /* Only consider real candidates. */ if (!cand->iv) { - set_use_iv_cost (data, use, cand, infinite_cost, NULL, NULL_TREE); + set_use_iv_cost (data, use, cand, infinite_cost, NULL, NULL_TREE, -1); return false; } @@ -4190,7 +4424,8 @@ determine_use_iv_cost_condition (struct ivopts_data *data, elim_cost.cost -= 1; express_cost = get_computation_cost (data, use, cand, false, - &depends_on_express, NULL); + &depends_on_express, NULL, + &inv_expr_id); fd_ivopts_data = data; walk_tree (&cmp_iv->base, find_depends, &depends_on_express, NULL); @@ -4209,7 +4444,7 @@ determine_use_iv_cost_condition (struct ivopts_data *data, bound = NULL_TREE; } - set_use_iv_cost (data, use, cand, cost, depends_on, bound); + set_use_iv_cost (data, use, cand, cost, depends_on, bound, inv_expr_id); if (depends_on_elim) BITMAP_FREE (depends_on_elim); @@ -4257,7 +4492,7 @@ autoinc_possible_for_pair (struct ivopts_data *data, struct iv_use *use, return false; cost = get_computation_cost (data, use, cand, true, &depends_on, - &can_autoinc); + &can_autoinc, NULL); BITMAP_FREE (depends_on); @@ -4381,6 +4616,8 @@ determine_use_iv_costs (struct ivopts_data *data) if (use->cost_map[j].depends_on) bitmap_print (dump_file, use->cost_map[j].depends_on, "",""); + if (use->cost_map[j].inv_expr_id != -1) + fprintf (dump_file, " inv_expr:%d", use->cost_map[j].inv_expr_id); fprintf (dump_file, "\n"); } @@ -4556,14 +4793,54 @@ cheaper_cost_pair (struct cost_pair *a, struct cost_pair *b) return false; } + +/* Returns candidate by that USE is expressed in IVS. */ + +static struct cost_pair * +iv_ca_cand_for_use (struct iv_ca *ivs, struct iv_use *use) +{ + return ivs->cand_for_use[use->id]; +} + + +/* Returns the number of temps needed for new loop invariant + expressions. */ + +static int +iv_ca_get_num_inv_exprs (struct ivopts_data *data, struct iv_ca *ivs) +{ + unsigned i, n = 0; + unsigned *used_inv_expr = XCNEWVEC (unsigned, data->inv_expr_id + 1); + + for (i = 0; i < ivs->upto; i++) + { + struct iv_use *use = iv_use (data, i); + struct cost_pair *cp = iv_ca_cand_for_use (ivs, use); + if (cp && cp->inv_expr_id != -1) + { + used_inv_expr[cp->inv_expr_id]++; + if (used_inv_expr[cp->inv_expr_id] == 1) + n++; + } + } + + free (used_inv_expr); + return n; +} + /* Computes the cost field of IVS structure. */ static void iv_ca_recount_cost (struct ivopts_data *data, struct iv_ca *ivs) { + unsigned n_inv_exprs = 0; comp_cost cost = ivs->cand_use_cost; + cost.cost += ivs->cand_cost; - cost.cost += ivopts_global_cost_for_size (data, ivs->n_regs); + + n_inv_exprs = iv_ca_get_num_inv_exprs (data, ivs); + cost.cost += ivopts_global_cost_for_size (data, + ivs->n_regs + n_inv_exprs); ivs->cost = cost; } @@ -4583,7 +4860,7 @@ iv_ca_set_remove_invariants (struct iv_ca *ivs, bitmap invs) { ivs->n_invariant_uses[iid]--; if (ivs->n_invariant_uses[iid] == 0) - ivs->n_regs--; + ivs->n_regs--; } } @@ -4638,7 +4915,7 @@ iv_ca_set_add_invariants (struct iv_ca *ivs, bitmap invs) { ivs->n_invariant_uses[iid]++; if (ivs->n_invariant_uses[iid] == 1) - ivs->n_regs++; + ivs->n_regs++; } } @@ -4682,14 +4959,16 @@ iv_ca_set_cp (struct ivopts_data *data, struct iv_ca *ivs, } /* Extend set IVS by expressing USE by some of the candidates in it - if possible. */ + if possible. All important candidates will be considered + if IMPORTANT_CANDIDATES is true. */ static void iv_ca_add_use (struct ivopts_data *data, struct iv_ca *ivs, - struct iv_use *use) + struct iv_use *use, bool important_candidates) { struct cost_pair *best_cp = NULL, *cp; bitmap_iterator bi; + bitmap cands; unsigned i; gcc_assert (ivs->upto >= use->id); @@ -4700,9 +4979,12 @@ iv_ca_add_use (struct ivopts_data *data, struct iv_ca *ivs, ivs->bad_uses++; } - EXECUTE_IF_SET_IN_BITMAP (ivs->cands, 0, i, bi) + cands = (important_candidates ? data->important_candidates : ivs->cands); + EXECUTE_IF_SET_IN_BITMAP (cands, 0, i, bi) { - cp = get_use_iv_cost (data, use, iv_cand (data, i)); + struct iv_cand *cand = iv_cand (data, i); + + cp = get_use_iv_cost (data, use, cand); if (cheaper_cost_pair (cp, best_cp)) best_cp = cp; @@ -4782,14 +5064,6 @@ iv_ca_delta_join (struct iv_ca_delta *l1, struct iv_ca_delta *l2) return l1; } -/* Returns candidate by that USE is expressed in IVS. */ - -static struct cost_pair * -iv_ca_cand_for_use (struct iv_ca *ivs, struct iv_use *use) -{ - return ivs->cand_for_use[use->id]; -} - /* Reverse the list of changes DELTA, forming the inverse to it. */ static struct iv_ca_delta * @@ -4913,8 +5187,21 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs) unsigned i; comp_cost cost = iv_ca_cost (ivs); - fprintf (file, " cost %d (complexity %d)\n", cost.cost, cost.complexity); - bitmap_print (file, ivs->cands, " candidates ","\n"); + fprintf (file, " cost: %d (complexity %d)\n", cost.cost, cost.complexity); + fprintf (file, " cand_cost: %d\n cand_use_cost: %d (complexity %d)\n", + ivs->cand_cost, ivs->cand_use_cost.cost, ivs->cand_use_cost.complexity); + bitmap_print (file, ivs->cands, " candidates: ","\n"); + + for (i = 0; i < ivs->upto; i++) + { + struct iv_use *use = iv_use (data, i); + struct cost_pair *cp = iv_ca_cand_for_use (ivs, use); + if (cp) + fprintf (file, " use:%d --> iv_cand:%d, cost=(%d,%d)\n", + use->id, cp->cand->id, cp->cost.cost, cp->cost.complexity); + else + fprintf (file, " use:%d --> ??\n", use->id); + } for (i = 1; i <= data->max_inv_id; i++) if (ivs->n_invariant_uses[i]) @@ -4922,17 +5209,18 @@ iv_ca_dump (struct ivopts_data *data, FILE *file, struct iv_ca *ivs) fprintf (file, "%s%d", pref, i); pref = ", "; } - fprintf (file, "\n"); + fprintf (file, "\n\n"); } /* Try changing candidate in IVS to CAND for each use. Return cost of the new set, and store differences in DELTA. Number of induction variables - in the new set is stored to N_IVS. */ + in the new set is stored to N_IVS. MIN_NCAND is a flag. When it is true + the function will try to find a solution with mimimal iv candidates. */ static comp_cost iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs, struct iv_cand *cand, struct iv_ca_delta **delta, - unsigned *n_ivs) + unsigned *n_ivs, bool min_ncand) { unsigned i; comp_cost cost; @@ -4953,11 +5241,11 @@ iv_ca_extend (struct ivopts_data *data, struct iv_ca *ivs, if (!new_cp) continue; - if (!iv_ca_has_deps (ivs, new_cp)) + if (!min_ncand && !iv_ca_has_deps (ivs, new_cp)) continue; - if (!cheaper_cost_pair (new_cp, old_cp)) - continue; + if (!min_ncand && !cheaper_cost_pair (new_cp, old_cp)) + continue; *delta = iv_ca_delta_add (use, old_cp, new_cp, *delta); } @@ -5008,8 +5296,9 @@ iv_ca_narrow (struct ivopts_data *data, struct iv_ca *ivs, cp = get_use_iv_cost (data, use, cnd); if (!cp) continue; + if (!iv_ca_has_deps (ivs, cp)) - continue; + continue; if (!cheaper_cost_pair (cp, new_cp)) continue; @@ -5121,10 +5410,18 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs, struct iv_ca_delta *best_delta = NULL, *act_delta; struct cost_pair *cp; - iv_ca_add_use (data, ivs, use); + iv_ca_add_use (data, ivs, use, false); best_cost = iv_ca_cost (ivs); cp = iv_ca_cand_for_use (ivs, use); + if (!cp) + { + ivs->upto--; + ivs->bad_uses--; + iv_ca_add_use (data, ivs, use, true); + best_cost = iv_ca_cost (ivs); + cp = iv_ca_cand_for_use (ivs, use); + } if (cp) { best_delta = iv_ca_delta_add (use, NULL, cp, NULL); @@ -5151,14 +5448,15 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs, continue; if (iv_ca_cand_used_p (ivs, cand)) - continue; + continue; cp = get_use_iv_cost (data, use, cand); if (!cp) continue; iv_ca_set_cp (data, ivs, use, cp); - act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL); + act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL, + true); iv_ca_set_no_cp (data, ivs, use); act_delta = iv_ca_delta_add (use, NULL, cp, act_delta); @@ -5196,7 +5494,7 @@ try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs, act_delta = NULL; iv_ca_set_cp (data, ivs, use, cp); - act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL); + act_cost = iv_ca_extend (data, ivs, cand, &act_delta, NULL, true); iv_ca_set_no_cp (data, ivs, use); act_delta = iv_ca_delta_add (use, iv_ca_cand_for_use (ivs, use), cp, act_delta); @@ -5256,7 +5554,7 @@ try_improve_iv_set (struct ivopts_data *data, struct iv_ca *ivs) if (iv_ca_cand_used_p (ivs, cand)) continue; - acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs); + acost = iv_ca_extend (data, ivs, cand, &act_delta, &n_ivs, false); if (!act_delta) continue; @@ -5416,7 +5714,6 @@ create_new_iv (struct ivopts_data *data, struct iv_cand *cand) /* Rewrite the increment so that it uses var_before directly. */ find_interesting_uses_op (data, cand->var_after)->selected = cand; - return; } @@ -5444,8 +5741,18 @@ create_new_ivs (struct ivopts_data *data, struct iv_ca *set) cand = iv_cand (data, i); create_new_iv (data, cand); } -} + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nSelected IV set: \n"); + EXECUTE_IF_SET_IN_BITMAP (set->cands, 0, i, bi) + { + cand = iv_cand (data, i); + dump_cand (dump_file, cand); + } + fprintf (dump_file, "\n"); + } +} /* Rewrites USE (definition of iv used in a nonlinear expression) using candidate CAND. */ @@ -5795,6 +6102,11 @@ rewrite_use_compare (struct ivopts_data *data, tree var_type = TREE_TYPE (var); gimple_seq stmts; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Replacing exit test: "); + print_gimple_stmt (dump_file, use->stmt, 0, TDF_SLIM); + } compare = iv_elimination_compare (data, use); bound = unshare_expr (fold_convert (var_type, bound)); op = force_gimple_operand (bound, &stmts, true, NULL_TREE); @@ -5979,6 +6291,9 @@ free_loop_data (struct ivopts_data *data) SET_DECL_RTL (obj, NULL_RTX); VEC_truncate (tree, decl_rtl_to_reset, 0); + + htab_empty (data->inv_expr_tab); + data->inv_expr_id = 0; } /* Finalizes data structures used by the iv optimization pass. LOOPS is the @@ -5995,6 +6310,7 @@ tree_ssa_iv_optimize_finalize (struct ivopts_data *data) VEC_free (tree, heap, decl_rtl_to_reset); VEC_free (iv_use_p, heap, data->iv_uses); VEC_free (iv_cand_p, heap, data->iv_candidates); + htab_delete (data->inv_expr_tab); } /* Returns true if the loop body BODY includes any function calls. */ -- 2.11.4.GIT