From 6202d4dbe64fab3147f67e6c836249f7e31ddd6c Mon Sep 17 00:00:00 2001 From: hjagasia Date: Tue, 11 Sep 2007 00:13:47 +0000 Subject: [PATCH] Harsha Jagasia Jan Sjodin * tree-vect-analyze.c (vect_analyze_operations): Change comparison of loop iterations with threshold to less than or equal to instead of less than. Reduce min_scalar_loop_bound by one. * tree-vect-transform.c (vect_estimate_min_profitable_iters): Change prologue and epilogue iterations estimate to vf/2, when unknown at compile-time. Change versioning guard cost to taken_branch_cost. If peeling for alignment is unknown at compile-time, change peel guard costs to one taken branch and one not-taken branch per peeled loop. If peeling for alignment is known but number of scalar loop iterations is unknown at compile-time, change peel guard costs to one taken branch per peeled loop. Change the cost model equation to consider vector iterations as the loop iterations less the prologue and epilogue iterations. Change outside vector cost check to less than or equal to zero instead of equal to zero. (vect_do_peeling_for_loop_bound): Reduce min_scalar_loop_bound by one. * tree-vectorizer.h: Add TARG_COND_TAKEN_BRANCH_COST and TARG_COND_NOT_TAKEN_BRANCH_COST. * config/i386/i386.h (processor_costs): Add scalar_stmt_cost, scalar_load_cost, scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost, scalar_to_vec_cost, vec_align_load_cost, vec_unalign_load_cost, vec_store_cost, cond_taken_branch_cost, cond_not_taken_branch_cost. Define macros for x86 costs. * config/i386/i386.c (size_cost): Set scalar_stmt_cost, scalar_load_cost, scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost, scalar_to_vec_cost, vec_align_load_cost, vec_unalign_load_cost, vec_store_cost, cond_taken_branch_cost, cond_not_taken_branch_cost to one. (i386_cost, i486_cost, pentium_cost, pentiumpro_cost, geode_cost, k6_cost, athlon_cost, pentium4_cost, nocona_cost, core2_cost, generic64_cost, generic32_cost): Set to default untuned costs. (k8_cost, amdfam10_cost): Costs for vectorization tuned. (x86_builtin_vectorization_cost): New. 2007-09-10 Harsha Jagasia * gcc.dg/vect/costmodel/i386/costmodel-vect-31.c: Change dg-final to expect 1 non-profitable loop and 3 profitable loops. * gcc.dg/vect/costmodel/x86_64/costmodel-vect-31.c: Change dg-final to expect 1 non-profitable loop and 3 profitable loops. * gcc.dg/vect/costmodel/x86_64/costmodel-fast-math-vect-pr29925.c: Change dg-final to expect 1 profitable loop. * gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c: Change dg-final to expect 1 profitable loop. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@128353 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 44 +++++ gcc/config/i386/i386.c | 218 +++++++++++++++++++-- gcc/config/i386/i386.h | 67 +++++++ gcc/testsuite/ChangeLog | 13 ++ .../i386/costmodel-fast-math-vect-pr29925.c | 2 +- .../gcc.dg/vect/costmodel/i386/costmodel-vect-31.c | 4 +- .../x86_64/costmodel-fast-math-vect-pr29925.c | 2 +- .../vect/costmodel/x86_64/costmodel-vect-31.c | 4 +- gcc/tree-vect-analyze.c | 6 +- gcc/tree-vect-transform.c | 79 ++++---- gcc/tree-vectorizer.h | 11 +- 11 files changed, 384 insertions(+), 66 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9bcfca14751..ab75d14b912 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,47 @@ +2007-09-10 Harsha Jagasia + Jan Sjodin + + * tree-vect-analyze.c (vect_analyze_operations): Change + comparison of loop iterations with threshold to less than + or equal to instead of less than. Reduce + min_scalar_loop_bound by one.
+ * tree-vect-transform.c (vect_estimate_min_profitable_iters): + Change prologue and epilogue iterations estimate to vf/2, + when unknown at compile-time. Change versioning guard + cost to taken_branch_cost. If peeling for alignment is + unknown at compile-time, change peel guard costs to one + taken branch and one not-taken branch per peeled loop. + If peeling for alignment is known but number of scalar loop + iterations is unknown at compile-time, change peel guard + costs to one taken branch per peeled loop. Change the cost + model equation to consider vector iterations as the loop + iterations less the prologue and epilogue iterations. + Change outside vector cost check to less than or equal to + zero instead of equal to zero. + (vect_do_peeling_for_loop_bound): Reduce + min_scalar_loop_bound by one. + * tree-vectorizer.h: Add TARG_COND_TAKEN_BRANCH_COST and + TARG_COND_NOT_TAKEN_BRANCH_COST. + * config/i386/i386.h (processor_costs): Add + scalar_stmt_cost, scalar_load_cost, scalar_store_cost, + vec_stmt_cost, vec_to_scalar_cost, scalar_to_vec_cost, + vec_align_load_cost, vec_unalign_load_cost, + vec_store_cost, cond_taken_branch_cost, + cond_not_taken_branch_cost. + Define macros for x86 costs. + * config/i386/i386.c + (size_cost): Set scalar_stmt_cost, scalar_load_cost, + scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost, + scalar_to_vec_cost, vec_align_load_cost, + vec_unalign_load_cost, vec_store_cost, + cond_taken_branch_cost, cond_not_taken_branch_cost to one. + (i386_cost, i486_cost, pentium_cost, pentiumpro_cost, + geode_cost, k6_cost, athlon_cost, pentium4_cost, nocona_cost, + core2_cost, generic64_cost, generic32_cost): Set to default + untuned costs. + (k8_cost, amdfam10_cost): Costs for vectorization tuned. + (x86_builtin_vectorization_cost): New. + 2007-09-10 Janis Johnson Ben Elliston diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index b1dda95adc7..ce7c19b46ec 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -52,6 +52,8 @@ along with GCC; see the file COPYING3. If not see #include "tm-constrs.h" #include "params.h" +static int x86_builtin_vectorization_cost (bool); + #ifndef CHECK_STACK_LIMIT #define CHECK_STACK_LIMIT (-1) #endif @@ -126,7 +128,18 @@ struct processor_costs size_cost = { /* costs for tuning for size */ {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, - {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}} + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 1, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 1, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; /* Processor costs (relative to an add) */ static const @@ -187,6 +200,17 @@ struct processor_costs i386_cost = { /* 386 specific costs */ DUMMY_STRINGOP_ALGS}, {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost.
*/ }; static const @@ -247,7 +271,18 @@ struct processor_costs i486_cost = { /* 486 specific costs */ {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, DUMMY_STRINGOP_ALGS}, {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, - DUMMY_STRINGOP_ALGS} + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -306,7 +341,18 @@ struct processor_costs pentium_cost = { {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, DUMMY_STRINGOP_ALGS}, {{libcall, {{-1, rep_prefix_4_byte}}}, - DUMMY_STRINGOP_ALGS} + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -372,7 +418,18 @@ struct processor_costs pentiumpro_cost = { DUMMY_STRINGOP_ALGS}, {{rep_prefix_4_byte, {{1024, unrolled_loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, - DUMMY_STRINGOP_ALGS} + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -432,7 +489,18 @@ struct processor_costs geode_cost = { {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, DUMMY_STRINGOP_ALGS}, {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, - DUMMY_STRINGOP_ALGS} + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -494,7 +562,18 @@ struct processor_costs k6_cost = { {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, DUMMY_STRINGOP_ALGS}, {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, - DUMMY_STRINGOP_ALGS} + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -556,7 +635,18 @@ struct processor_costs athlon_cost = { {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, DUMMY_STRINGOP_ALGS}, {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, - DUMMY_STRINGOP_ALGS} + DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. 
*/ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -624,7 +714,18 @@ struct processor_costs k8_cost = { {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, {{libcall, {{8, loop}, {24, unrolled_loop}, {2048, rep_prefix_4_byte}, {-1, libcall}}}, - {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 5, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 3, /* vec_unalign_load_cost. */ + 3, /* vec_store_cost. */ + 6, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; struct processor_costs amdfam10_cost = { @@ -700,7 +801,18 @@ struct processor_costs amdfam10_cost = { {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, {{libcall, {{8, loop}, {24, unrolled_loop}, {2048, rep_prefix_4_byte}, {-1, libcall}}}, - {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 6, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -761,6 +873,17 @@ struct processor_costs pentium4_cost = { {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, {-1, libcall}}}, DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -822,7 +945,18 @@ struct processor_costs nocona_cost = { {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, {-1, libcall}}}, {libcall, {{24, loop}, {64, unrolled_loop}, - {8192, rep_prefix_8_byte}, {-1, libcall}}}} + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; static const @@ -883,7 +1017,18 @@ struct processor_costs core2_cost = { {{libcall, {{8, loop}, {15, unrolled_loop}, {2048, rep_prefix_4_byte}, {-1, libcall}}}, {libcall, {{24, loop}, {32, unrolled_loop}, - {8192, rep_prefix_8_byte}, {-1, libcall}}}} + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; /* Generic64 should produce code tuned for Nocona and K8. 
*/ @@ -949,7 +1094,18 @@ struct processor_costs generic64_cost = { {DUMMY_STRINGOP_ALGS, {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, {DUMMY_STRINGOP_ALGS, - {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */ @@ -1010,6 +1166,17 @@ struct processor_costs generic32_cost = { DUMMY_STRINGOP_ALGS}, {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, DUMMY_STRINGOP_ALGS}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ }; const struct processor_costs *ix86_cost = &pentium_cost; @@ -23615,6 +23782,30 @@ static const struct attribute_spec ix86_attribute_table[] = { NULL, 0, 0, false, false, false, NULL } }; +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +x86_builtin_vectorization_cost (bool runtime_test) +{ + /* If the branch of the runtime test is taken - i.e., the vectorized + version is skipped - this incurs a misprediction cost (because the + vectorized version is expected to be the fall-through). So we subtract + the latency of a mispredicted branch from the costs that are incurred + when the vectorized version is executed. + + TODO: The values in individual target tables have to be tuned or new + fields may be needed. E.g., on K8, the default branch path is the + not-taken path. If the taken path is predicted correctly, the minimum + penalty of going down the taken path is 1 cycle. If the taken path is + not predicted correctly, then the minimum penalty is 10 cycles. */ + + if (runtime_test) + { + return (-(ix86_cost->cond_taken_branch_cost)); + } + else + return 0; +} + /* Initialize the GCC target structure. */ #undef TARGET_ATTRIBUTE_TABLE #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table @@ -23791,6 +23982,9 @@ static const struct attribute_spec ix86_attribute_table[] = #undef TARGET_FUNCTION_VALUE #define TARGET_FUNCTION_VALUE ix86_function_value +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST +#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST x86_builtin_vectorization_cost + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-i386.h" diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index e18e90f600c..a14c74b101d 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -138,6 +138,22 @@ struct processor_costs { /* Specify what algorithm to use for stringops on unknown size. */ struct stringop_algs memcpy[2], memset[2]; + const int scalar_stmt_cost; /* Cost of any scalar operation, excluding + load and store. */ + const int scalar_load_cost; /* Cost of scalar load. */ + const int scalar_store_cost; /* Cost of scalar store. */ + const int vec_stmt_cost; /* Cost of any vector operation, excluding + load, store, vector-to-scalar and + scalar-to-vector operation.
*/ + const int vec_to_scalar_cost; /* Cost of vector-to-scalar operation. */ + const int scalar_to_vec_cost; /* Cost of scalar-to-vector operation. */ + const int vec_align_load_cost; /* Cost of aligned vector load. */ + const int vec_unalign_load_cost; /* Cost of unaligned vector load. */ + const int vec_store_cost; /* Cost of vector store. */ + const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer + cost model. */ + const int cond_not_taken_branch_cost;/* Cost of not taken branch for + vectorizer cost model. */ }; extern const struct processor_costs *ix86_cost; @@ -2460,6 +2476,57 @@ struct machine_function GTY(()) #define SYMBOL_REF_DLLEXPORT_P(X) \ ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLEXPORT) != 0) +/* Model costs for vectorizer. */ + +/* Cost of conditional branch. */ +#undef TARG_COND_BRANCH_COST +#define TARG_COND_BRANCH_COST ix86_cost->branch_cost + +/* Cost of any scalar operation, excluding load and store. */ +#undef TARG_SCALAR_STMT_COST +#define TARG_SCALAR_STMT_COST ix86_cost->scalar_stmt_cost + +/* Cost of scalar load. */ +#undef TARG_SCALAR_LOAD_COST +#define TARG_SCALAR_LOAD_COST ix86_cost->scalar_load_cost + +/* Cost of scalar store. */ +#undef TARG_SCALAR_STORE_COST +#define TARG_SCALAR_STORE_COST ix86_cost->scalar_store_cost + +/* Cost of any vector operation, excluding load, store or vector to scalar + operation. */ +#undef TARG_VEC_STMT_COST +#define TARG_VEC_STMT_COST ix86_cost->vec_stmt_cost + +/* Cost of vector to scalar operation. */ +#undef TARG_VEC_TO_SCALAR_COST +#define TARG_VEC_TO_SCALAR_COST ix86_cost->vec_to_scalar_cost + +/* Cost of scalar to vector operation. */ +#undef TARG_SCALAR_TO_VEC_COST +#define TARG_SCALAR_TO_VEC_COST ix86_cost->scalar_to_vec_cost + +/* Cost of aligned vector load. */ +#undef TARG_VEC_LOAD_COST +#define TARG_VEC_LOAD_COST ix86_cost->vec_align_load_cost + +/* Cost of misaligned vector load. */ +#undef TARG_VEC_UNALIGNED_LOAD_COST +#define TARG_VEC_UNALIGNED_LOAD_COST ix86_cost->vec_unalign_load_cost + +/* Cost of vector store. */ +#undef TARG_VEC_STORE_COST +#define TARG_VEC_STORE_COST ix86_cost->vec_store_cost + +/* Cost of conditional taken branch for vectorizer cost model. */ +#undef TARG_COND_TAKEN_BRANCH_COST +#define TARG_COND_TAKEN_BRANCH_COST ix86_cost->cond_taken_branch_cost + +/* Cost of conditional not taken branch for vectorizer cost model. */ +#undef TARG_COND_NOT_TAKEN_BRANCH_COST +#define TARG_COND_NOT_TAKEN_BRANCH_COST ix86_cost->cond_not_taken_branch_cost + /* Local variables: version-control: t diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 792984e9ae3..aca143371ff 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,16 @@ +2007-09-10 Harsha Jagasia + + * gcc.dg/vect/costmodel/i386/costmodel-vect-31.c: + Change dg-final to expect 1 non-profitable loop and + 3 profitable loops. + * gcc.dg/vect/costmodel/x86_64/costmodel-vect-31.c: + Change dg-final to expect 1 non-profitable loop and + 3 profitable loops. + * gcc.dg/vect/costmodel/x86_64/costmodel-fast-math-vect-pr29925.c: + Change dg-final to expect 1 profitable loop. + * gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c: + Change dg-final to expect 1 profitable loop. + 2007-09-10 Richard Sandiford * gcc.target/mips/call-saved-1.c: New test.
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c b/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c index 5d84017dd24..2766ced2871 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c @@ -35,6 +35,6 @@ int main() return 0; } -/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-vect-31.c b/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-vect-31.c index 11e8a29eada..efab30d4ac6 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-vect-31.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-vect-31.c @@ -85,7 +85,7 @@ int main (void) return main1 (); } -/* { dg-final { scan-tree-dump-times "vectorization not profitable" 2 "vect" } } +/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-fast-math-vect-pr29925.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-fast-math-vect-pr29925.c index 9a9f97e802b..0bc09f6da32 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-fast-math-vect-pr29925.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-fast-math-vect-pr29925.c @@ -35,6 +35,6 @@ int main() return 0; } -/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-31.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-31.c index 11e8a29eada..efab30d4ac6 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-31.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-31.c @@ -85,7 +85,7 @@ int main (void) return main1 (); } -/* { dg-final { scan-tree-dump-times "vectorization not profitable" 2 "vect" } } +/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c index 684d12dfcb4..557bf2fb470 100644 --- a/gcc/tree-vect-analyze.c +++ b/gcc/tree-vect-analyze.c @@ -596,8 +596,8 @@ vect_analyze_operations (loop_vec_info loop_vinfo) return false; } - min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) - * vectorization_factor; + min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) + * vectorization_factor) - 1); /* Use the cost model only if it is more conservative than user specified threshold. 
*/ @@ -609,7 +609,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo) th = (unsigned) min_profitable_iters; if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && LOOP_VINFO_INT_NITERS (loop_vinfo) < th) + && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th) { if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) fprintf (vect_dump, "not vectorized: vectorization not " diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 30dbf712e55..e2ee92b0d0e 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -124,6 +124,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); int nbbs = loop->num_nodes; int byte_misalign; + int peel_guard_costs = 0; int innerloop_iters = 0, factor; VEC (slp_instance, heap) *slp_instances; slp_instance instance; @@ -141,7 +142,7 @@ if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))) { - vec_outside_cost += TARG_COND_BRANCH_COST; + vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "cost model: Adding cost of checks for loop " "versioning.\n"); @@ -188,7 +189,7 @@ loop. FORNOW: If we dont know the value of peel_iters for prologue or epilogue - at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1). + at compile-time - we assume it's vf/2 (the worst would be vf-1). TODO: Build an expression that represents peel_iters for prologue and epilogue to be used in a run-time test. */ @@ -197,18 +198,26 @@ if (byte_misalign < 0) { - peel_iters_prologue = (vf - 1)/2; + peel_iters_prologue = vf/2; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "cost model: " - "prologue peel iters set to (vf-1)/2."); + "prologue peel iters set to vf/2."); /* If peeling for alignment is unknown, loop bound of main loop becomes unknown. */ - peel_iters_epilogue = (vf - 1)/2; + peel_iters_epilogue = vf/2; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "cost model: " - "epilogue peel iters set to (vf-1)/2 because " + "epilogue peel iters set to vf/2 because " "peeling for alignment is unknown ."); + + /* If peeled iterations are unknown, count a taken branch and a not-taken + branch per peeled loop. Even if scalar loop iterations are known, + vector iterations are not known since peeled prologue iterations are + not known. Hence guards remain the same. */ + peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST + + TARG_COND_NOT_TAKEN_BRANCH_COST); + } else { @@ -226,11 +235,16 @@ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { - peel_iters_epilogue = (vf - 1)/2; + peel_iters_epilogue = vf/2; if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "cost model: " - "epilogue peel iters set to (vf-1)/2 because " + "epilogue peel iters set to vf/2 because " "loop iterations are unknown ."); + + /* If peeled iterations are known but the number of scalar loop + iterations is unknown, count a taken branch per peeled loop. */ + peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST; + } else { @@ -241,33 +255,9 @@ } } - /* Requires a prologue loop when peeling to handle misalignment. Add cost of - two guards, one for the peeled loop and one for the vector loop.
*/ - - if (peel_iters_prologue) - { - vec_outside_cost += 2 * TARG_COND_BRANCH_COST; - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "cost model: Adding cost of checks for " - "prologue.\n"); - } - - /* Requires an epilogue loop to finish up remaining iterations after vector - loop. Add cost of two guards, one for the peeled loop and one for the - vector loop. */ - - if (peel_iters_epilogue - || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf) - { - vec_outside_cost += 2 * TARG_COND_BRANCH_COST; - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "cost model : Adding cost of checks for " - "epilogue.\n"); - } - vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost) - + (peel_iters_epilogue * scalar_single_iter_cost); + + (peel_iters_epilogue * scalar_single_iter_cost) + + peel_guard_costs; /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only information we provide for the target is whether testing against the @@ -305,11 +295,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) if ((scalar_single_iter_cost * vf) > vec_inside_cost) { - if (vec_outside_cost == 0) + if (vec_outside_cost <= 0) min_profitable_iters = 1; else { - min_profitable_iters = (vec_outside_cost * vf) + min_profitable_iters = (vec_outside_cost * vf + - vec_inside_cost * peel_iters_prologue + - vec_inside_cost * peel_iters_epilogue) / ((scalar_single_iter_cost * vf) - vec_inside_cost); @@ -344,8 +336,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) peel_iters_epilogue); fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n", min_profitable_iters); - fprintf (vect_dump, " Actual minimum iters for profitability: %d\n", - min_profitable_iters < vf ? vf : min_profitable_iters); } min_profitable_iters = @@ -355,6 +345,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) if (niters <= min_profitable_iters) then skip the vectorized loop. */ min_profitable_iters--; + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, " Profitability threshold = %d\n", + min_profitable_iters); + return min_profitable_iters; } @@ -6452,8 +6447,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio) /* Analyze cost to set threshhold for vectorized loop. */ min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo); - min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)) - * LOOP_VINFO_VECT_FACTOR (loop_vinfo); + min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) + * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1); /* Use the cost model only if it is more conservative than user specified threshold. */ @@ -6464,8 +6459,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio) || min_profitable_iters > min_scalar_loop_bound)) th = (unsigned) min_profitable_iters; - if (min_profitable_iters - && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) + || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) && vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "vectorization may not be profitable."); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 913c524137b..49ee0452378 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -469,9 +469,14 @@ typedef struct _stmt_vec_info { /* These are some defines for the initial implementation of the vectorizer's cost model. These will later be target specific hooks. */ -/* Cost of conditional branch. 
*/ -#ifndef TARG_COND_BRANCH_COST -#define TARG_COND_BRANCH_COST 3 +/* Cost of conditional taken branch. */ +#ifndef TARG_COND_TAKEN_BRANCH_COST +#define TARG_COND_TAKEN_BRANCH_COST 3 +#endif + +/* Cost of conditional not taken branch. */ +#ifndef TARG_COND_NOT_TAKEN_BRANCH_COST +#define TARG_COND_NOT_TAKEN_BRANCH_COST 1 #endif /* Cost of any scalar operation, excluding load and store. */ -- 2.11.4.GIT
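Appendix: a worked example of the revised profitability threshold. The new equation in vect_estimate_min_profitable_iters charges the prologue and epilogue iterations to the scalar side, i.e. it solves scalar_single_iter_cost * n > vec_outside_cost + vec_inside_cost * (n - peel_iters_prologue - peel_iters_epilogue) / vf for the smallest integer n. The standalone C sketch below illustrates that computation. It is not the GCC source: the function name, the flattened parameter list, and the cost values in main are invented for illustration, and the extra clamping GCC applies elsewhere (e.g. the min_scalar_loop_bound comparison in the callers) is omitted.

#include <stdio.h>

/* Illustrative stand-ins for TARG_COND_TAKEN_BRANCH_COST and
   TARG_COND_NOT_TAKEN_BRANCH_COST; real values come from the per-CPU
   cost tables (the tuned k8_cost table uses 6 and 1).  */
#define TAKEN_BRANCH_COST     6
#define NOT_TAKEN_BRANCH_COST 1

/* Smallest iteration count for which the vector loop wins, mirroring the
   post-patch equation.  Returns -1 when the vector body can never win.  */
static int
min_profitable_iters_sketch (int scalar_single_iter_cost,
                             int vec_inside_cost, int vec_outside_cost,
                             int peel_iters_prologue,
                             int peel_iters_epilogue, int vf)
{
  int min_profitable_iters;

  if (scalar_single_iter_cost * vf <= vec_inside_cost)
    return -1;  /* Vectorization can never be profitable.  */

  if (vec_outside_cost <= 0)
    min_profitable_iters = 1;
  else
    min_profitable_iters = (vec_outside_cost * vf
                            - vec_inside_cost * peel_iters_prologue
                            - vec_inside_cost * peel_iters_epilogue)
                           / ((scalar_single_iter_cost * vf)
                              - vec_inside_cost);

  /* The runtime guard now tests niters <= threshold, hence the final
     decrement, as at the end of vect_estimate_min_profitable_iters.  */
  return min_profitable_iters - 1;
}

int
main (void)
{
  /* Invented example: vf = 4 with alignment unknown at compile time, so
     both peel counts default to vf/2 and the outside cost includes the
     peeled scalar iterations plus two taken and two not-taken guards.  */
  int vf = 4, scalar_single_iter_cost = 4, vec_inside_cost = 8;
  int peel_prologue = vf / 2, peel_epilogue = vf / 2;
  int peel_guard_costs = 2 * (TAKEN_BRANCH_COST + NOT_TAKEN_BRANCH_COST);
  int vec_outside_cost = (peel_prologue + peel_epilogue)
                         * scalar_single_iter_cost + peel_guard_costs;

  printf ("skip the vector loop when niters <= %d\n",
          min_profitable_iters_sketch (scalar_single_iter_cost,
                                       vec_inside_cost, vec_outside_cost,
                                       peel_prologue, peel_epilogue, vf));
  return 0;
}

Note also that when this threshold has to be tested at runtime, the new x86_builtin_vectorization_cost hook returns the negated cond_taken_branch_cost, reducing vec_outside_cost: the guard branch is taken only when the vectorized version is skipped, so the misprediction penalty is credited against the vectorized path, which is the expected fall-through.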