gcc-4_6_3-mobile/gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "tree-pretty-print.h"
  31 #include "gimple-pretty-print.h"
  32 #include "tree-flow.h"
  33 #include "tree-dump.h"
  34 #include "cfgloop.h"
  35 #include "cfglayout.h"
  36 #include "expr.h"
  37 #include "recog.h"
  38 #include "optabs.h"
  39 #include "params.h"
  40 #include "diagnostic-core.h"
  41 #include "tree-chrec.h"
  42 #include "tree-scalar-evolution.h"
  43 #include "tree-vectorizer.h"
  44 #include "target.h"
  45
  46 /* Loop Vectorization Pass.
  47
  48    This pass tries to vectorize loops.
  49
  50    For example, the vectorizer transforms the following simple loop:
  51
  52         short a[N]; short b[N]; short c[N]; int i;
  53
  54         for (i=0; i<N; i++){
  55           a[i] = b[i] + c[i];
  56         }
  57
  58    as if it was manually vectorized by rewriting the source code into:
  59
  60         typedef int __attribute__((mode(V8HI))) v8hi;
  61         short a[N];  short b[N]; short c[N];   int i;
  62         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  63         v8hi va, vb, vc;
  64
  65         for (i=0; i<N/8; i++){
  66           vb = pb[i];
  67           vc = pc[i];
  68           va = vb + vc;
  69           pa[i] = va;
  70         }
  71
  72         The main entry to this pass is vectorize_loops(), in which
  73    the vectorizer applies a set of analyses on a given set of loops,
  74    followed by the actual vectorization transformation for the loops that
  75    had successfully passed the analysis phase.
  76         Throughout this pass we make a distinction between two types of
  77    data: scalars (which are represented by SSA_NAMES), and memory references
  78    ("data-refs").  These two types of data require different handling both
  79    during analysis and transformation. The types of data-refs that the
  80    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  81    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  82    accesses are required to have a simple (consecutive) access pattern.
  83
  84    Analysis phase:
  85    ===============
  86         The driver for the analysis phase is vect_analyze_loop().
  87    It applies a set of analyses, some of which rely on the scalar evolution
  88    analyzer (scev) developed by Sebastian Pop.
  89
  90         During the analysis phase the vectorizer records some information
  91    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  92    loop, as well as general information about the loop as a whole, which is
  93    recorded in a "loop_vec_info" struct attached to each loop.
  94
  95    Transformation phase:
  96    =====================
  97         The loop transformation phase scans all the stmts in the loop, and
  98    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  99    the loop that needs to be vectorized.  It inserts the vector code sequence
 100    just before the scalar stmt S, and records a pointer to the vector code
 101    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 102    attached to S).  This pointer will be used for the vectorization of following
 103    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 104    otherwise, we rely on dead code elimination for removing it.
 105
 106         For example, say stmt S1 was vectorized into stmt VS1:
 107
 108    VS1: vb = px[i];
 109    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 110    S2:  a = b;
 111
 112    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 113    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 114    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 115    resulting sequence would be:
 116
 117    VS1: vb = px[i];
 118    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 119    VS2: va = vb;
 120    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 121
 122         Operands that are not SSA_NAMEs, are data-refs that appear in
 123    load/store operations (like 'x[i]' in S1), and are handled differently.
 124
 125    Target modeling:
 126    =================
 127         Currently the only target specific information that is used is the
 128    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 129    Targets that can support different sizes of vectors, for now will need
 130    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 131    flexibility will be added in the future.
 132
 133         Since we only vectorize operations which vector form can be
 134    expressed using existing tree codes, to verify that an operation is
 135    supported, the vectorizer checks the relevant optab at the relevant
 136    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 137    the value found is CODE_FOR_nothing, then there's no target support, and
 138    we can't vectorize the stmt.
 139
 140    For additional information on this project see:
 141    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 142 */
 143
 144 /* Function vect_determine_vectorization_factor
 145
 146    Determine the vectorization factor (VF).  VF is the number of data elements
 147    that are operated upon in parallel in a single iteration of the vectorized
 148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 150    elements can fit in a single vector register.
 151
 152    We currently support vectorization of loops in which all types operated upon
 153    are of the same size.  Therefore this function currently sets VF according to
 154    the size of the types operated upon, and fails if there are multiple sizes
 155    in the loop.
 156
 157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 158    original loop:
 159         for (i=0; i<N; i++){
 160           a[i] = b[i] + c[i];
 161         }
 162
 163    vectorized loop:
 164         for (i=0; i<N; i+=VF){
 165           a[i:VF] = b[i:VF] + c[i:VF];
 166         }
 167 */
 168
 169 static bool
 170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 171 {
 172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 174   int nbbs = loop->num_nodes;
 175   gimple_stmt_iterator si;
 176   unsigned int vectorization_factor = 0;
 177   tree scalar_type;
 178   gimple phi;
 179   tree vectype;
 180   unsigned int nunits;
 181   stmt_vec_info stmt_info;
 182   int i;
 183   HOST_WIDE_INT dummy;
 184
 185   if (vect_print_dump_info (REPORT_DETAILS))
 186     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 187
 188   for (i = 0; i < nbbs; i++)
 189     {
 190       basic_block bb = bbs[i];
 191
 192       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 193         {
 194           phi = gsi_stmt (si);
 195           stmt_info = vinfo_for_stmt (phi);
 196           if (vect_print_dump_info (REPORT_DETAILS))
 197             {
 198               fprintf (vect_dump, "==> examining phi: ");
 199               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 200             }
 201
 202           gcc_assert (stmt_info);
 203
 204           if (STMT_VINFO_RELEVANT_P (stmt_info))
 205             {
 206               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 207               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 208
 209               if (vect_print_dump_info (REPORT_DETAILS))
 210                 {
 211                   fprintf (vect_dump, "get vectype for scalar type:  ");
 212                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 213                 }
 214
 215               vectype = get_vectype_for_scalar_type (scalar_type);
 216               if (!vectype)
 217                 {
 218                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 219                     {
 220                       fprintf (vect_dump,
 221                                "not vectorized: unsupported data-type ");
 222                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 223                     }
 224                   return false;
 225                 }
 226               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 227
 228               if (vect_print_dump_info (REPORT_DETAILS))
 229                 {
 230                   fprintf (vect_dump, "vectype: ");
 231                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 232                 }
 233
 234               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 235               if (vect_print_dump_info (REPORT_DETAILS))
 236                 fprintf (vect_dump, "nunits = %d", nunits);
 237
 238               if (!vectorization_factor
 239                   || (nunits > vectorization_factor))
 240                 vectorization_factor = nunits;
 241             }
 242         }
 243
 244       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 245         {
 246           tree vf_vectype;
 247           gimple stmt = gsi_stmt (si);
 248           stmt_info = vinfo_for_stmt (stmt);
 249
 250           if (vect_print_dump_info (REPORT_DETAILS))
 251             {
 252               fprintf (vect_dump, "==> examining statement: ");
 253               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 254             }
 255
 256           gcc_assert (stmt_info);
 257
 258           /* skip stmts which do not need to be vectorized.  */
 259           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 260               && !STMT_VINFO_LIVE_P (stmt_info))
 261             {
 262               if (vect_print_dump_info (REPORT_DETAILS))
 263                 fprintf (vect_dump, "skip.");
 264               continue;
 265             }
 266
 267           if (gimple_get_lhs (stmt) == NULL_TREE)
 268             {
 269               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 270                 {
 271                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 272                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 273                 }
 274               return false;
 275             }
 276
 277           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 278             {
 279               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 280                 {
 281                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 282                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 283                 }
 284               return false;
 285             }
 286
 287           if (STMT_VINFO_VECTYPE (stmt_info))
 288             {
 289               /* The only case when a vectype had been already set is for stmts
 290                  that contain a dataref, or for "pattern-stmts" (stmts generated
 291                  by the vectorizer to represent/replace a certain idiom).  */
 292               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 293                           || is_pattern_stmt_p (stmt_info));
 294               vectype = STMT_VINFO_VECTYPE (stmt_info);
 295             }
 296           else
 297             {
 298               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 299                           && !is_pattern_stmt_p (stmt_info));
 300
 301               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 302               if (vect_print_dump_info (REPORT_DETAILS))
 303                 {
 304                   fprintf (vect_dump, "get vectype for scalar type:  ");
 305                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 306                 }
 307               vectype = get_vectype_for_scalar_type (scalar_type);
 308               if (!vectype)
 309                 {
 310                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 311                     {
 312                       fprintf (vect_dump,
 313                                "not vectorized: unsupported data-type ");
 314                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 315                     }
 316                   return false;
 317                 }
 318
 319               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 320             }
 321
 322           /* The vectorization factor is according to the smallest
 323              scalar type (or the largest vector size, but we only
 324              support one vector size per loop).  */
 325           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 326                                                        &dummy);
 327           if (vect_print_dump_info (REPORT_DETAILS))
 328             {
 329               fprintf (vect_dump, "get vectype for scalar type:  ");
 330               print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 331             }
 332           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 333           if (!vf_vectype)
 334             {
 335               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 336                 {
 337                   fprintf (vect_dump,
 338                            "not vectorized: unsupported data-type ");
 339                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 340                 }
 341               return false;
 342             }
 343
 344           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 345                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 346             {
 347               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 348                 {
 349                   fprintf (vect_dump,
 350                            "not vectorized: different sized vector "
 351                            "types in statement, ");
 352                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 353                   fprintf (vect_dump, " and ");
 354                   print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 355                 }
 356               return false;
 357             }
 358
 359           if (vect_print_dump_info (REPORT_DETAILS))
 360             {
 361               fprintf (vect_dump, "vectype: ");
 362               print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 363             }
 364
 365           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 366           if (vect_print_dump_info (REPORT_DETAILS))
 367             fprintf (vect_dump, "nunits = %d", nunits);
 368
 369           if (!vectorization_factor
 370               || (nunits > vectorization_factor))
 371             vectorization_factor = nunits;
 372         }
 373     }
 374
 375   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 376   if (vect_print_dump_info (REPORT_DETAILS))
 377     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 378   if (vectorization_factor <= 1)
 379     {
 380       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 381         fprintf (vect_dump, "not vectorized: unsupported data-type");
 382       return false;
 383     }
 384   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 385
 386   return true;
 387 }
 388
 389
 390 /* Function vect_is_simple_iv_evolution.
 391
 392    FORNOW: A simple evolution of an induction variables in the loop is
 393    considered a polynomial evolution with constant step.  */
 394
 395 static bool
 396 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 397                              tree * step)
 398 {
 399   tree init_expr;
 400   tree step_expr;
 401   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 402
 403   /* When there is no evolution in this loop, the evolution function
 404      is not "simple".  */
 405   if (evolution_part == NULL_TREE)
 406     return false;
 407
 408   /* When the evolution is a polynomial of degree >= 2
 409      the evolution function is not "simple".  */
 410   if (tree_is_chrec (evolution_part))
 411     return false;
 412
 413   step_expr = evolution_part;
 414   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 415
 416   if (vect_print_dump_info (REPORT_DETAILS))
 417     {
 418       fprintf (vect_dump, "step: ");
 419       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 420       fprintf (vect_dump, ",  init: ");
 421       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 422     }
 423
 424   *init = init_expr;
 425   *step = step_expr;
 426
 427   if (TREE_CODE (step_expr) != INTEGER_CST)
 428     {
 429       if (vect_print_dump_info (REPORT_DETAILS))
 430         fprintf (vect_dump, "step unknown.");
 431       return false;
 432     }
 433
 434   return true;
 435 }
 436
 437 /* Function vect_analyze_scalar_cycles_1.
 438
 439    Examine the cross iteration def-use cycles of scalar variables
 440    in LOOP.  LOOP_VINFO represents the loop that is now being
 441    considered for vectorization (can be LOOP, or an outer-loop
 442    enclosing LOOP).  */
 443
 444 static void
 445 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 446 {
 447   basic_block bb = loop->header;
 448   tree dumy;
 449   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 450   gimple_stmt_iterator gsi;
 451   bool double_reduc;
 452
 453   if (vect_print_dump_info (REPORT_DETAILS))
 454     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 455
 456   /* First - identify all inductions.  Reduction detection assumes that all the
 457      inductions have been identified, therefore, this order must not be
 458      changed.  */
 459   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 460     {
 461       gimple phi = gsi_stmt (gsi);
 462       tree access_fn = NULL;
 463       tree def = PHI_RESULT (phi);
 464       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 465
 466       if (vect_print_dump_info (REPORT_DETAILS))
 467         {
 468           fprintf (vect_dump, "Analyze phi: ");
 469           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 470         }
 471
 472       /* Skip virtual phi's.  The data dependences that are associated with
 473          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 474       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 475         continue;
 476
 477       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 478
 479       /* Analyze the evolution function.  */
 480       access_fn = analyze_scalar_evolution (loop, def);
 481       if (access_fn)
 482         STRIP_NOPS (access_fn);
 483       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 484         {
 485           fprintf (vect_dump, "Access function of PHI: ");
 486           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 487         }
 488
 489       if (!access_fn
 490           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 491         {
 492           VEC_safe_push (gimple, heap, worklist, phi);
 493           continue;
 494         }
 495
 496       if (vect_print_dump_info (REPORT_DETAILS))
 497         fprintf (vect_dump, "Detected induction.");
 498       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 499     }
 500
 501
 502   /* Second - identify all reductions and nested cycles.  */
 503   while (VEC_length (gimple, worklist) > 0)
 504     {
 505       gimple phi = VEC_pop (gimple, worklist);
 506       tree def = PHI_RESULT (phi);
 507       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 508       gimple reduc_stmt;
 509       bool nested_cycle;
 510
 511       if (vect_print_dump_info (REPORT_DETAILS))
 512         {
 513           fprintf (vect_dump, "Analyze phi: ");
 514           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 515         }
 516
 517       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 518       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 519
 520       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 521       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 522                                                 &double_reduc);
 523       if (reduc_stmt)
 524         {
 525           if (double_reduc)
 526             {
 527               if (vect_print_dump_info (REPORT_DETAILS))
 528                 fprintf (vect_dump, "Detected double reduction.");
 529
 530               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 531               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 532                                                     vect_double_reduction_def;
 533             }
 534           else
 535             {
 536               if (nested_cycle)
 537                 {
 538                   if (vect_print_dump_info (REPORT_DETAILS))
 539                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 540
 541                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 542                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 543                                                              vect_nested_cycle;
 544                 }
 545               else
 546                 {
 547                   if (vect_print_dump_info (REPORT_DETAILS))
 548                     fprintf (vect_dump, "Detected reduction.");
 549
 550                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 551                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 552                                                            vect_reduction_def;
 553                   /* Store the reduction cycles for possible vectorization in
 554                      loop-aware SLP.  */
 555                   VEC_safe_push (gimple, heap,
 556                                  LOOP_VINFO_REDUCTIONS (loop_vinfo),
 557                                  reduc_stmt);
 558                 }
 559             }
 560         }
 561       else
 562         if (vect_print_dump_info (REPORT_DETAILS))
 563           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 564     }
 565
 566   VEC_free (gimple, heap, worklist);
 567 }
 568
 569
 570 /* Function vect_analyze_scalar_cycles.
 571
 572    Examine the cross iteration def-use cycles of scalar variables, by
 573    analyzing the loop-header PHIs of scalar variables.  Classify each
 574    cycle as one of the following: invariant, induction, reduction, unknown.
 575    We do that for the loop represented by LOOP_VINFO, and also to its
 576    inner-loop, if exists.
 577    Examples for scalar cycles:
 578
 579    Example1: reduction:
 580
 581               loop1:
 582               for (i=0; i<N; i++)
 583                  sum += a[i];
 584
 585    Example2: induction:
 586
 587               loop2:
 588               for (i=0; i<N; i++)
 589                  a[i] = i;  */
 590
 591 static void
 592 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 593 {
 594   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 595
 596   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 597
 598   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 599      Reductions in such inner-loop therefore have different properties than
 600      the reductions in the nest that gets vectorized:
 601      1. When vectorized, they are executed in the same order as in the original
 602         scalar loop, so we can't change the order of computation when
 603         vectorizing them.
 604      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 605         current checks are too strict.  */
 606
 607   if (loop->inner)
 608     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 609 }
 610
 611 /* Function vect_get_loop_niters.
 612
 613    Determine how many iterations the loop is executed.
 614    If an expression that represents the number of iterations
 615    can be constructed, place it in NUMBER_OF_ITERATIONS.
 616    Return the loop exit condition.  */
 617
 618 static gimple
 619 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 620 {
 621   tree niters;
 622
 623   if (vect_print_dump_info (REPORT_DETAILS))
 624     fprintf (vect_dump, "=== get_loop_niters ===");
 625
 626   niters = number_of_exit_cond_executions (loop);
 627
 628   if (niters != NULL_TREE
 629       && niters != chrec_dont_know)
 630     {
 631       *number_of_iterations = niters;
 632
 633       if (vect_print_dump_info (REPORT_DETAILS))
 634         {
 635           fprintf (vect_dump, "==> get_loop_niters:" );
 636           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 637         }
 638     }
 639
 640   return get_loop_exit_condition (loop);
 641 }
 642
 643
 644 /* Function bb_in_loop_p
 645
 646    Used as predicate for dfs order traversal of the loop bbs.  */
 647
 648 static bool
 649 bb_in_loop_p (const_basic_block bb, const void *data)
 650 {
 651   const struct loop *const loop = (const struct loop *)data;
 652   if (flow_bb_inside_loop_p (loop, bb))
 653     return true;
 654   return false;
 655 }
 656
 657
 658 /* Function new_loop_vec_info.
 659
 660    Create and initialize a new loop_vec_info struct for LOOP, as well as
 661    stmt_vec_info structs for all the stmts in LOOP.  */
 662
 663 static loop_vec_info
 664 new_loop_vec_info (struct loop *loop)
 665 {
 666   loop_vec_info res;
 667   basic_block *bbs;
 668   gimple_stmt_iterator si;
 669   unsigned int i, nbbs;
 670
 671   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 672   LOOP_VINFO_LOOP (res) = loop;
 673
 674   bbs = get_loop_body (loop);
 675
 676   /* Create/Update stmt_info for all stmts in the loop.  */
 677   for (i = 0; i < loop->num_nodes; i++)
 678     {
 679       basic_block bb = bbs[i];
 680
 681       /* BBs in a nested inner-loop will have been already processed (because
 682          we will have called vect_analyze_loop_form for any nested inner-loop).
 683          Therefore, for stmts in an inner-loop we just want to update the
 684          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 685          loop_info of the outer-loop we are currently considering to vectorize
 686          (instead of the loop_info of the inner-loop).
 687          For stmts in other BBs we need to create a stmt_info from scratch.  */
 688       if (bb->loop_father != loop)
 689         {
 690           /* Inner-loop bb.  */
 691           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 692           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 693             {
 694               gimple phi = gsi_stmt (si);
 695               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 696               loop_vec_info inner_loop_vinfo =
 697                 STMT_VINFO_LOOP_VINFO (stmt_info);
 698               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 699               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 700             }
 701           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 702            {
 703               gimple stmt = gsi_stmt (si);
 704               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 705               loop_vec_info inner_loop_vinfo =
 706                  STMT_VINFO_LOOP_VINFO (stmt_info);
 707               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 708               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 709            }
 710         }
 711       else
 712         {
 713           /* bb in current nest.  */
 714           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 715             {
 716               gimple phi = gsi_stmt (si);
 717               gimple_set_uid (phi, 0);
 718               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 719             }
 720
 721           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 722             {
 723               gimple stmt = gsi_stmt (si);
 724               gimple_set_uid (stmt, 0);
 725               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 726             }
 727         }
 728     }
 729
 730   /* CHECKME: We want to visit all BBs before their successors (except for
 731      latch blocks, for which this assertion wouldn't hold).  In the simple
 732      case of the loop forms we allow, a dfs order of the BBs would the same
 733      as reversed postorder traversal, so we are safe.  */
 734
 735    free (bbs);
 736    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 737    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 738                               bbs, loop->num_nodes, loop);
 739    gcc_assert (nbbs == loop->num_nodes);
 740
 741   LOOP_VINFO_BBS (res) = bbs;
 742   LOOP_VINFO_NITERS (res) = NULL;
 743   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 744   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 745   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 746   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 747   LOOP_VINFO_VECT_FACTOR (res) = 0;
 748   LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
 749   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 750   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 751   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 752   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 753     VEC_alloc (gimple, heap,
 754                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 755   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 756     VEC_alloc (ddr_p, heap,
 757                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 758   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 759   LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
 760   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 761   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 762   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 763   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 764
 765   return res;
 766 }
 767
 768
 769 /* Function destroy_loop_vec_info.
 770
 771    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 772    stmts in the loop.  */
 773
 774 void
 775 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 776 {
 777   struct loop *loop;
 778   basic_block *bbs;
 779   int nbbs;
 780   gimple_stmt_iterator si;
 781   int j;
 782   VEC (slp_instance, heap) *slp_instances;
 783   slp_instance instance;
 784
 785   if (!loop_vinfo)
 786     return;
 787
 788   loop = LOOP_VINFO_LOOP (loop_vinfo);
 789
 790   bbs = LOOP_VINFO_BBS (loop_vinfo);
 791   nbbs = loop->num_nodes;
 792
 793   if (!clean_stmts)
 794     {
 795       free (LOOP_VINFO_BBS (loop_vinfo));
 796       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 797       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 798       VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
 799       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 800       VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 801
 802       free (loop_vinfo);
 803       loop->aux = NULL;
 804       return;
 805     }
 806
 807   for (j = 0; j < nbbs; j++)
 808     {
 809       basic_block bb = bbs[j];
 810       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 811         free_stmt_vec_info (gsi_stmt (si));
 812
 813       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 814         {
 815           gimple stmt = gsi_stmt (si);
 816           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 817
 818           if (stmt_info)
 819             {
 820               /* Check if this is a "pattern stmt" (introduced by the
 821                  vectorizer during the pattern recognition pass).  */
 822               bool remove_stmt_p = false;
 823               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 824               if (orig_stmt)
 825                 {
 826                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 827                   if (orig_stmt_info
 828                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 829                     remove_stmt_p = true;
 830                 }
 831
 832               /* Free stmt_vec_info.  */
 833               free_stmt_vec_info (stmt);
 834
 835               /* Remove dead "pattern stmts".  */
 836               if (remove_stmt_p)
 837                 gsi_remove (&si, true);
 838             }
 839           gsi_next (&si);
 840         }
 841     }
 842
 843   free (LOOP_VINFO_BBS (loop_vinfo));
 844   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 845   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 846   VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
 847   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 848   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 849   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 850   FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
 851     vect_free_slp_instance (instance);
 852
 853   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 854   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 855   VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
 856
 857   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 858     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 859
 860   free (loop_vinfo);
 861   loop->aux = NULL;
 862 }
 863
 864
 865 /* Function vect_analyze_loop_1.
 866
 867    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 868    for it. The different analyses will record information in the
 869    loop_vec_info struct.  This is a subset of the analyses applied in
 870    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 871    that is now considered for (outer-loop) vectorization.  */
 872
 873 static loop_vec_info
 874 vect_analyze_loop_1 (struct loop *loop)
 875 {
 876   loop_vec_info loop_vinfo;
 877
 878   if (vect_print_dump_info (REPORT_DETAILS))
 879     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 880
 881   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 882
 883   loop_vinfo = vect_analyze_loop_form (loop);
 884   if (!loop_vinfo)
 885     {
 886       if (vect_print_dump_info (REPORT_DETAILS))
 887         fprintf (vect_dump, "bad inner-loop form.");
 888       return NULL;
 889     }
 890
 891   return loop_vinfo;
 892 }
 893
 894
 895 /* Function vect_analyze_loop_form.
 896
 897    Verify that certain CFG restrictions hold, including:
 898    - the loop has a pre-header
 899    - the loop has a single entry and exit
 900    - the loop exit condition is simple enough, and the number of iterations
 901      can be analyzed (a countable loop).  */
 902
 903 loop_vec_info
 904 vect_analyze_loop_form (struct loop *loop)
 905 {
 906   loop_vec_info loop_vinfo;
 907   gimple loop_cond;
 908   tree number_of_iterations = NULL;
 909   loop_vec_info inner_loop_vinfo = NULL;
 910
 911   if (vect_print_dump_info (REPORT_DETAILS))
 912     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 913
 914   /* Different restrictions apply when we are considering an inner-most loop,
 915      vs. an outer (nested) loop.
 916      (FORNOW. May want to relax some of these restrictions in the future).  */
 917
 918   if (!loop->inner)
 919     {
 920       /* Inner-most loop.  We currently require that the number of BBs is
 921          exactly 2 (the header and latch).  Vectorizable inner-most loops
 922          look like this:
 923
 924                         (pre-header)
 925                            |
 926                           header <--------+
 927                            | |            |
 928                            | +--> latch --+
 929                            |
 930                         (exit-bb)  */
 931
 932       if (loop->num_nodes != 2)
 933         {
 934           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 935             fprintf (vect_dump, "not vectorized: control flow in loop.");
 936           return NULL;
 937         }
 938
 939       if (empty_block_p (loop->header))
 940     {
 941           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 942             fprintf (vect_dump, "not vectorized: empty loop.");
 943       return NULL;
 944     }
 945     }
 946   else
 947     {
 948       struct loop *innerloop = loop->inner;
 949       edge entryedge;
 950
 951       /* Nested loop. We currently require that the loop is doubly-nested,
 952          contains a single inner loop, and the number of BBs is exactly 5.
 953          Vectorizable outer-loops look like this:
 954
 955                         (pre-header)
 956                            |
 957                           header <---+
 958                            |         |
 959                           inner-loop |
 960                            |         |
 961                           tail ------+
 962                            |
 963                         (exit-bb)
 964
 965          The inner-loop has the properties expected of inner-most loops
 966          as described above.  */
 967
 968       if ((loop->inner)->inner || (loop->inner)->next)
 969         {
 970           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 971             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 972           return NULL;
 973         }
 974
 975       /* Analyze the inner-loop.  */
 976       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 977       if (!inner_loop_vinfo)
 978         {
 979           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 980             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 981           return NULL;
 982         }
 983
 984       if (!expr_invariant_in_loop_p (loop,
 985                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 986         {
 987           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 988             fprintf (vect_dump,
 989                      "not vectorized: inner-loop count not invariant.");
 990           destroy_loop_vec_info (inner_loop_vinfo, true);
 991           return NULL;
 992         }
 993
 994       if (loop->num_nodes != 5)
 995         {
 996           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 997             fprintf (vect_dump, "not vectorized: control flow in loop.");
 998           destroy_loop_vec_info (inner_loop_vinfo, true);
 999           return NULL;
1000         }
1001
1002       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1003       entryedge = EDGE_PRED (innerloop->header, 0);
1004       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1005         entryedge = EDGE_PRED (innerloop->header, 1);
1006
1007       if (entryedge->src != loop->header
1008           || !single_exit (innerloop)
1009           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1010         {
1011           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1012             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1013           destroy_loop_vec_info (inner_loop_vinfo, true);
1014           return NULL;
1015         }
1016
1017       if (vect_print_dump_info (REPORT_DETAILS))
1018         fprintf (vect_dump, "Considering outer-loop vectorization.");
1019     }
1020
1021   if (!single_exit (loop)
1022       || EDGE_COUNT (loop->header->preds) != 2)
1023     {
1024       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1025         {
1026           if (!single_exit (loop))
1027             fprintf (vect_dump, "not vectorized: multiple exits.");
1028           else if (EDGE_COUNT (loop->header->preds) != 2)
1029             fprintf (vect_dump, "not vectorized: too many incoming edges.");
1030         }
1031       if (inner_loop_vinfo)
1032         destroy_loop_vec_info (inner_loop_vinfo, true);
1033       return NULL;
1034     }
1035
1036   /* We assume that the loop exit condition is at the end of the loop. i.e,
1037      that the loop is represented as a do-while (with a proper if-guard
1038      before the loop if needed), where the loop header contains all the
1039      executable statements, and the latch is empty.  */
1040   if (!empty_block_p (loop->latch)
1041         || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1042     {
1043       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1044         fprintf (vect_dump, "not vectorized: unexpected loop form.");
1045       if (inner_loop_vinfo)
1046         destroy_loop_vec_info (inner_loop_vinfo, true);
1047       return NULL;
1048     }
1049
1050   /* Make sure there exists a single-predecessor exit bb:  */
1051   if (!single_pred_p (single_exit (loop)->dest))
1052     {
1053       edge e = single_exit (loop);
1054       if (!(e->flags & EDGE_ABNORMAL))
1055         {
1056           split_loop_exit_edge (e);
1057           if (vect_print_dump_info (REPORT_DETAILS))
1058             fprintf (vect_dump, "split exit edge.");
1059         }
1060       else
1061         {
1062           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1063             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1064           if (inner_loop_vinfo)
1065             destroy_loop_vec_info (inner_loop_vinfo, true);
1066           return NULL;
1067         }
1068     }
1069
1070   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1071   if (!loop_cond)
1072     {
1073       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1074         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1075       if (inner_loop_vinfo)
1076         destroy_loop_vec_info (inner_loop_vinfo, true);
1077       return NULL;
1078     }
1079
1080   if (!number_of_iterations)
1081     {
1082       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1083         fprintf (vect_dump,
1084                  "not vectorized: number of iterations cannot be computed.");
1085       if (inner_loop_vinfo)
1086         destroy_loop_vec_info (inner_loop_vinfo, true);
1087       return NULL;
1088     }
1089
1090   if (chrec_contains_undetermined (number_of_iterations))
1091     {
1092       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1093         fprintf (vect_dump, "Infinite number of iterations.");
1094       if (inner_loop_vinfo)
1095         destroy_loop_vec_info (inner_loop_vinfo, true);
1096       return NULL;
1097     }
1098
1099   if (!NITERS_KNOWN_P (number_of_iterations))
1100     {
1101       if (vect_print_dump_info (REPORT_DETAILS))
1102         {
1103           fprintf (vect_dump, "Symbolic number of iterations is ");
1104           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1105         }
1106     }
1107   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1108     {
1109       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1110         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1111       if (inner_loop_vinfo)
1112         destroy_loop_vec_info (inner_loop_vinfo, false);
1113       return NULL;
1114     }
1115
1116   loop_vinfo = new_loop_vec_info (loop);
1117   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1118   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1119
1120   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1121
1122   /* CHECKME: May want to keep it around it in the future.  */
1123   if (inner_loop_vinfo)
1124     destroy_loop_vec_info (inner_loop_vinfo, false);
1125
1126   gcc_assert (!loop->aux);
1127   loop->aux = loop_vinfo;
1128   return loop_vinfo;
1129 }
1130
1131
1132 /* Get cost by calling cost target builtin.  */
1133
1134 static inline int
1135 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1136 {
1137   tree dummy_type = NULL;
1138   int dummy = 0;
1139
1140   return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1141                                                        dummy_type, dummy);
1142 }
1143
1144
1145 /* Function vect_analyze_loop_operations.
1146
1147    Scan the loop stmts and make sure they are all vectorizable.  */
1148
1149 static bool
1150 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1151 {
1152   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1153   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1154   int nbbs = loop->num_nodes;
1155   gimple_stmt_iterator si;
1156   unsigned int vectorization_factor = 0;
1157   int i;
1158   gimple phi;
1159   stmt_vec_info stmt_info;
1160   bool need_to_vectorize = false;
1161   int min_profitable_iters;
1162   int min_scalar_loop_bound;
1163   unsigned int th;
1164   bool only_slp_in_loop = true, ok;
1165
1166   if (vect_print_dump_info (REPORT_DETAILS))
1167     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1168
1169   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1170   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1171
1172   for (i = 0; i < nbbs; i++)
1173     {
1174       basic_block bb = bbs[i];
1175
1176       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1177         {
1178           phi = gsi_stmt (si);
1179           ok = true;
1180
1181           stmt_info = vinfo_for_stmt (phi);
1182           if (vect_print_dump_info (REPORT_DETAILS))
1183             {
1184               fprintf (vect_dump, "examining phi: ");
1185               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1186             }
1187
1188           if (! is_loop_header_bb_p (bb))
1189             {
1190               /* inner-loop loop-closed exit phi in outer-loop vectorization
1191                  (i.e. a phi in the tail of the outer-loop).
1192                  FORNOW: we currently don't support the case that these phis
1193                  are not used in the outerloop (unless it is double reduction,
1194                  i.e., this phi is vect_reduction_def), cause this case
1195                  requires to actually do something here.  */
1196               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1197                    || STMT_VINFO_LIVE_P (stmt_info))
1198                   && STMT_VINFO_DEF_TYPE (stmt_info)
1199                      != vect_double_reduction_def)
1200                 {
1201                   if (vect_print_dump_info (REPORT_DETAILS))
1202                     fprintf (vect_dump,
1203                              "Unsupported loop-closed phi in outer-loop.");
1204                   return false;
1205                 }
1206               continue;
1207             }
1208
1209           gcc_assert (stmt_info);
1210
1211           if (STMT_VINFO_LIVE_P (stmt_info))
1212             {
1213               /* FORNOW: not yet supported.  */
1214               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1215                 fprintf (vect_dump, "not vectorized: value used after loop.");
1216               return false;
1217             }
1218
1219           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1220               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1221             {
1222               /* A scalar-dependence cycle that we don't support.  */
1223               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1224                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1225               return false;
1226             }
1227
1228           if (STMT_VINFO_RELEVANT_P (stmt_info))
1229             {
1230               need_to_vectorize = true;
1231               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1232                 ok = vectorizable_induction (phi, NULL, NULL);
1233             }
1234
1235           if (!ok)
1236             {
1237               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1238                 {
1239                   fprintf (vect_dump,
1240                            "not vectorized: relevant phi not supported: ");
1241                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1242                 }
1243               return false;
1244             }
1245         }
1246
1247       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1248         {
1249           gimple stmt = gsi_stmt (si);
1250           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1251
1252           gcc_assert (stmt_info);
1253
1254           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1255             return false;
1256
1257           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1258                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1259               && !PURE_SLP_STMT (stmt_info))
1260             /* STMT needs both SLP and loop-based vectorization.  */
1261             only_slp_in_loop = false;
1262         }
1263     } /* bbs */
1264
1265   /* All operations in the loop are either irrelevant (deal with loop
1266      control, or dead), or only used outside the loop and can be moved
1267      out of the loop (e.g. invariants, inductions).  The loop can be
1268      optimized away by scalar optimizations.  We're better off not
1269      touching this loop.  */
1270   if (!need_to_vectorize)
1271     {
1272       if (vect_print_dump_info (REPORT_DETAILS))
1273         fprintf (vect_dump,
1274                  "All the computation can be taken out of the loop.");
1275       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1276         fprintf (vect_dump,
1277                  "not vectorized: redundant loop. no profit to vectorize.");
1278       return false;
1279     }
1280
1281   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1282      vectorization factor of the loop is the unrolling factor required by the
1283      SLP instances.  If that unrolling factor is 1, we say, that we perform
1284      pure SLP on loop - cross iteration parallelism is not exploited.  */
1285   if (only_slp_in_loop)
1286     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1287   else
1288     vectorization_factor = least_common_multiple (vectorization_factor,
1289                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1290
1291   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1292
1293   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1294       && vect_print_dump_info (REPORT_DETAILS))
1295     fprintf (vect_dump,
1296         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1297         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1298
1299   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1300       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1301     {
1302       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1303         fprintf (vect_dump, "not vectorized: iteration count too small.");
1304       if (vect_print_dump_info (REPORT_DETAILS))
1305         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1306                  "vectorization factor.");
1307       return false;
1308     }
1309
1310   /* Analyze cost.  Decide if worth while to vectorize.  */
1311
1312   /* Once VF is set, SLP costs should be updated since the number of created
1313      vector stmts depends on VF.  */
1314   vect_update_slp_costs_according_to_vf (loop_vinfo);
1315
1316   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1317   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1318
1319   if (min_profitable_iters < 0)
1320     {
1321       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1322         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1323       if (vect_print_dump_info (REPORT_DETAILS))
1324         fprintf (vect_dump, "not vectorized: vector version will never be "
1325                  "profitable.");
1326       return false;
1327     }
1328
1329   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1330                             * vectorization_factor) - 1);
1331
1332   /* Use the cost model only if it is more conservative than user specified
1333      threshold.  */
1334
1335   th = (unsigned) min_scalar_loop_bound;
1336   if (min_profitable_iters
1337       && (!min_scalar_loop_bound
1338           || min_profitable_iters > min_scalar_loop_bound))
1339     th = (unsigned) min_profitable_iters;
1340
1341   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1342       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1343     {
1344       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1345         fprintf (vect_dump, "not vectorized: vectorization not "
1346                  "profitable.");
1347       if (vect_print_dump_info (REPORT_DETAILS))
1348         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1349                  "user specified loop bound parameter or minimum "
1350                  "profitable iterations (whichever is more conservative).");
1351       return false;
1352     }
1353
1354   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1355       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1356       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1357     {
1358       if (vect_print_dump_info (REPORT_DETAILS))
1359         fprintf (vect_dump, "epilog loop required.");
1360       if (!vect_can_advance_ivs_p (loop_vinfo))
1361         {
1362           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1363             fprintf (vect_dump,
1364                      "not vectorized: can't create epilog loop 1.");
1365           return false;
1366         }
1367       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1368         {
1369           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1370             fprintf (vect_dump,
1371                      "not vectorized: can't create epilog loop 2.");
1372           return false;
1373         }
1374     }
1375
1376   return true;
1377 }
1378
1379
1380 /* Function vect_analyze_loop_2.
1381
1382    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1383    for it.  The different analyses will record information in the
1384    loop_vec_info struct.  */
1385 static bool
1386 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1387 {
1388   bool ok, dummy;
1389   int max_vf = MAX_VECTORIZATION_FACTOR;
1390   int min_vf = 2;
1391
1392   /* Find all data references in the loop (which correspond to vdefs/vuses)
1393      and analyze their evolution in the loop.  Also adjust the minimal
1394      vectorization factor according to the loads and stores.
1395
1396      FORNOW: Handle only simple, array references, which
1397      alignment can be forced, and aligned pointer-references.  */
1398
1399   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1400   if (!ok)
1401     {
1402       if (vect_print_dump_info (REPORT_DETAILS))
1403         fprintf (vect_dump, "bad data references.");
1404       return false;
1405     }
1406
1407   /* Classify all cross-iteration scalar data-flow cycles.
1408      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1409
1410   vect_analyze_scalar_cycles (loop_vinfo);
1411
1412   vect_pattern_recog (loop_vinfo);
1413
1414   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1415
1416   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1417   if (!ok)
1418     {
1419       if (vect_print_dump_info (REPORT_DETAILS))
1420         fprintf (vect_dump, "unexpected pattern.");
1421       return false;
1422     }
1423
1424   /* Analyze data dependences between the data-refs in the loop
1425      and adjust the maximum vectorization factor according to
1426      the dependences.
1427      FORNOW: fail at the first data dependence that we encounter.  */
1428
1429   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1430   if (!ok
1431       || max_vf < min_vf)
1432     {
1433       if (vect_print_dump_info (REPORT_DETAILS))
1434         fprintf (vect_dump, "bad data dependence.");
1435       return false;
1436     }
1437
1438   ok = vect_determine_vectorization_factor (loop_vinfo);
1439   if (!ok)
1440     {
1441       if (vect_print_dump_info (REPORT_DETAILS))
1442         fprintf (vect_dump, "can't determine vectorization factor.");
1443       return false;
1444     }
1445   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1446     {
1447       if (vect_print_dump_info (REPORT_DETAILS))
1448         fprintf (vect_dump, "bad data dependence.");
1449       return false;
1450     }
1451
1452   /* Analyze the alignment of the data-refs in the loop.
1453      Fail if a data reference is found that cannot be vectorized.  */
1454
1455   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1456   if (!ok)
1457     {
1458       if (vect_print_dump_info (REPORT_DETAILS))
1459         fprintf (vect_dump, "bad data alignment.");
1460       return false;
1461     }
1462
1463   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1464      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1465
1466   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1467   if (!ok)
1468     {
1469       if (vect_print_dump_info (REPORT_DETAILS))
1470         fprintf (vect_dump, "bad data access.");
1471       return false;
1472     }
1473
1474   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1475      It is important to call pruning after vect_analyze_data_ref_accesses,
1476      since we use grouping information gathered by interleaving analysis.  */
1477   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1478   if (!ok)
1479     {
1480       if (vect_print_dump_info (REPORT_DETAILS))
1481         fprintf (vect_dump, "too long list of versioning for alias "
1482                             "run-time tests.");
1483       return false;
1484     }
1485
1486   /* This pass will decide on using loop versioning and/or loop peeling in
1487      order to enhance the alignment of data references in the loop.  */
1488
1489   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1490   if (!ok)
1491     {
1492       if (vect_print_dump_info (REPORT_DETAILS))
1493         fprintf (vect_dump, "bad data alignment.");
1494       return false;
1495     }
1496
1497   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1498   ok = vect_analyze_slp (loop_vinfo, NULL);
1499   if (ok)
1500     {
1501       /* Decide which possible SLP instances to SLP.  */
1502       vect_make_slp_decision (loop_vinfo);
1503
1504       /* Find stmts that need to be both vectorized and SLPed.  */
1505       vect_detect_hybrid_slp (loop_vinfo);
1506     }
1507
1508   /* Scan all the operations in the loop and make sure they are
1509      vectorizable.  */
1510
1511   ok = vect_analyze_loop_operations (loop_vinfo);
1512   if (!ok)
1513     {
1514       if (vect_print_dump_info (REPORT_DETAILS))
1515         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1516       return false;
1517     }
1518
1519   return true;
1520 }
1521
1522 /* Function vect_analyze_loop.
1523
1524    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1525    for it.  The different analyses will record information in the
1526    loop_vec_info struct.  */
1527 loop_vec_info
1528 vect_analyze_loop (struct loop *loop)
1529 {
1530   loop_vec_info loop_vinfo;
1531   unsigned int vector_sizes;
1532
1533   /* Autodetect first vector size we try.  */
1534   current_vector_size = 0;
1535   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1536
1537   if (vect_print_dump_info (REPORT_DETAILS))
1538     fprintf (vect_dump, "===== analyze_loop_nest =====");
1539
1540   if (loop_outer (loop)
1541       && loop_vec_info_for_loop (loop_outer (loop))
1542       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1543     {
1544       if (vect_print_dump_info (REPORT_DETAILS))
1545         fprintf (vect_dump, "outer-loop already vectorized.");
1546       return NULL;
1547     }
1548
1549   while (1)
1550     {
1551       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1552       loop_vinfo = vect_analyze_loop_form (loop);
1553       if (!loop_vinfo)
1554         {
1555           if (vect_print_dump_info (REPORT_DETAILS))
1556             fprintf (vect_dump, "bad loop form.");
1557           return NULL;
1558         }
1559
1560       if (vect_analyze_loop_2 (loop_vinfo))
1561         {
1562           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1563
1564           return loop_vinfo;
1565         }
1566
1567       destroy_loop_vec_info (loop_vinfo, true);
1568
1569       vector_sizes &= ~current_vector_size;
1570       if (vector_sizes == 0
1571           || current_vector_size == 0)
1572         return NULL;
1573
1574       /* Try the next biggest vector size.  */
1575       current_vector_size = 1 << floor_log2 (vector_sizes);
1576       if (vect_print_dump_info (REPORT_DETAILS))
1577         fprintf (vect_dump, "***** Re-trying analysis with "
1578                  "vector size %d\n", current_vector_size);
1579     }
1580 }
1581
1582
1583 /* Function reduction_code_for_scalar_code
1584
1585    Input:
1586    CODE - tree_code of a reduction operations.
1587
1588    Output:
1589    REDUC_CODE - the corresponding tree-code to be used to reduce the
1590       vector of partial results into a single scalar result (which
1591       will also reside in a vector) or ERROR_MARK if the operation is
1592       a supported reduction operation, but does not have such tree-code.
1593
1594    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1595
1596 static bool
1597 reduction_code_for_scalar_code (enum tree_code code,
1598                                 enum tree_code *reduc_code)
1599 {
1600   switch (code)
1601     {
1602       case MAX_EXPR:
1603         *reduc_code = REDUC_MAX_EXPR;
1604         return true;
1605
1606       case MIN_EXPR:
1607         *reduc_code = REDUC_MIN_EXPR;
1608         return true;
1609
1610       case PLUS_EXPR:
1611         *reduc_code = REDUC_PLUS_EXPR;
1612         return true;
1613
1614       case MULT_EXPR:
1615       case MINUS_EXPR:
1616       case BIT_IOR_EXPR:
1617       case BIT_XOR_EXPR:
1618       case BIT_AND_EXPR:
1619         *reduc_code = ERROR_MARK;
1620         return true;
1621
1622       default:
1623        return false;
1624     }
1625 }
1626
1627
1628 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1629    STMT is printed with a message MSG. */
1630
1631 static void
1632 report_vect_op (gimple stmt, const char *msg)
1633 {
1634   fprintf (vect_dump, "%s", msg);
1635   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1636 }
1637
1638
1639 /* Function vect_is_simple_reduction_1
1640
1641    (1) Detect a cross-iteration def-use cycle that represents a simple
1642    reduction computation.  We look for the following pattern:
1643
1644    loop_header:
1645      a1 = phi < a0, a2 >
1646      a3 = ...
1647      a2 = operation (a3, a1)
1648
1649    such that:
1650    1. operation is commutative and associative and it is safe to
1651       change the order of the computation (if CHECK_REDUCTION is true)
1652    2. no uses for a2 in the loop (a2 is used out of the loop)
1653    3. no uses of a1 in the loop besides the reduction operation
1654    4. no uses of a1 outside the loop.
1655
1656    Conditions 1,4 are tested here.
1657    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1658
1659    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1660    nested cycles, if CHECK_REDUCTION is false.
1661
1662    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1663    reductions:
1664
1665      a1 = phi < a0, a2 >
1666      inner loop (def of a3)
1667      a2 = phi < a3 >
1668
1669    If MODIFY is true it tries also to rework the code in-place to enable
1670    detection of more reduction patterns.  For the time being we rewrite
1671    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1672 */
1673
1674 static gimple
1675 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1676                             bool check_reduction, bool *double_reduc,
1677                             bool modify)
1678 {
1679   struct loop *loop = (gimple_bb (phi))->loop_father;
1680   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1681   edge latch_e = loop_latch_edge (loop);
1682   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1683   gimple def_stmt, def1 = NULL, def2 = NULL;
1684   enum tree_code orig_code, code;
1685   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1686   tree type;
1687   int nloop_uses;
1688   tree name;
1689   imm_use_iterator imm_iter;
1690   use_operand_p use_p;
1691   bool phi_def;
1692
1693   *double_reduc = false;
1694
1695   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1696      otherwise, we assume outer loop vectorization.  */
1697   gcc_assert ((check_reduction && loop == vect_loop)
1698               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1699
1700   name = PHI_RESULT (phi);
1701   nloop_uses = 0;
1702   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1703     {
1704       gimple use_stmt = USE_STMT (use_p);
1705       if (is_gimple_debug (use_stmt))
1706         continue;
1707
1708       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1709         {
1710           if (vect_print_dump_info (REPORT_DETAILS))
1711             fprintf (vect_dump, "intermediate value used outside loop.");
1712
1713           return NULL;
1714         }
1715
1716       if (vinfo_for_stmt (use_stmt)
1717           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1718         nloop_uses++;
1719       if (nloop_uses > 1)
1720         {
1721           if (vect_print_dump_info (REPORT_DETAILS))
1722             fprintf (vect_dump, "reduction used in loop.");
1723           return NULL;
1724         }
1725     }
1726
1727   if (TREE_CODE (loop_arg) != SSA_NAME)
1728     {
1729       if (vect_print_dump_info (REPORT_DETAILS))
1730         {
1731           fprintf (vect_dump, "reduction: not ssa_name: ");
1732           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1733         }
1734       return NULL;
1735     }
1736
1737   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1738   if (!def_stmt)
1739     {
1740       if (vect_print_dump_info (REPORT_DETAILS))
1741         fprintf (vect_dump, "reduction: no def_stmt.");
1742       return NULL;
1743     }
1744
1745   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1746     {
1747       if (vect_print_dump_info (REPORT_DETAILS))
1748         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1749       return NULL;
1750     }
1751
1752   if (is_gimple_assign (def_stmt))
1753     {
1754       name = gimple_assign_lhs (def_stmt);
1755       phi_def = false;
1756     }
1757   else
1758     {
1759       name = PHI_RESULT (def_stmt);
1760       phi_def = true;
1761     }
1762
1763   nloop_uses = 0;
1764   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1765     {
1766       gimple use_stmt = USE_STMT (use_p);
1767       if (is_gimple_debug (use_stmt))
1768         continue;
1769       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1770           && vinfo_for_stmt (use_stmt)
1771           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1772         nloop_uses++;
1773       if (nloop_uses > 1)
1774         {
1775           if (vect_print_dump_info (REPORT_DETAILS))
1776             fprintf (vect_dump, "reduction used in loop.");
1777           return NULL;
1778         }
1779     }
1780
1781   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1782      defined in the inner loop.  */
1783   if (phi_def)
1784     {
1785       op1 = PHI_ARG_DEF (def_stmt, 0);
1786
1787       if (gimple_phi_num_args (def_stmt) != 1
1788           || TREE_CODE (op1) != SSA_NAME)
1789         {
1790           if (vect_print_dump_info (REPORT_DETAILS))
1791             fprintf (vect_dump, "unsupported phi node definition.");
1792
1793           return NULL;
1794         }
1795
1796       def1 = SSA_NAME_DEF_STMT (op1);
1797       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1798           && loop->inner
1799           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1800           && is_gimple_assign (def1))
1801         {
1802           if (vect_print_dump_info (REPORT_DETAILS))
1803             report_vect_op (def_stmt, "detected double reduction: ");
1804
1805           *double_reduc = true;
1806           return def_stmt;
1807         }
1808
1809       return NULL;
1810     }
1811
1812   code = orig_code = gimple_assign_rhs_code (def_stmt);
1813
1814   /* We can handle "res -= x[i]", which is non-associative by
1815      simply rewriting this into "res += -x[i]".  Avoid changing
1816      gimple instruction for the first simple tests and only do this
1817      if we're allowed to change code at all.  */
1818   if (code == MINUS_EXPR
1819       && modify
1820       && (op1 = gimple_assign_rhs1 (def_stmt))
1821       && TREE_CODE (op1) == SSA_NAME
1822       && SSA_NAME_DEF_STMT (op1) == phi)
1823     code = PLUS_EXPR;
1824
1825   if (check_reduction
1826       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1827     {
1828       if (vect_print_dump_info (REPORT_DETAILS))
1829         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1830       return NULL;
1831     }
1832
1833   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1834     {
1835       if (code != COND_EXPR)
1836         {
1837           if (vect_print_dump_info (REPORT_DETAILS))
1838             report_vect_op (def_stmt, "reduction: not binary operation: ");
1839
1840           return NULL;
1841         }
1842
1843       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1844       if (COMPARISON_CLASS_P (op3))
1845         {
1846           op4 = TREE_OPERAND (op3, 1);
1847           op3 = TREE_OPERAND (op3, 0);
1848         }
1849
1850       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1851       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1852
1853       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1854         {
1855           if (vect_print_dump_info (REPORT_DETAILS))
1856             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1857
1858           return NULL;
1859         }
1860     }
1861   else
1862     {
1863       op1 = gimple_assign_rhs1 (def_stmt);
1864       op2 = gimple_assign_rhs2 (def_stmt);
1865
1866       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1867         {
1868           if (vect_print_dump_info (REPORT_DETAILS))
1869             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1870
1871           return NULL;
1872         }
1873    }
1874
1875   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1876   if ((TREE_CODE (op1) == SSA_NAME
1877        && !types_compatible_p (type,TREE_TYPE (op1)))
1878       || (TREE_CODE (op2) == SSA_NAME
1879           && !types_compatible_p (type, TREE_TYPE (op2)))
1880       || (op3 && TREE_CODE (op3) == SSA_NAME
1881           && !types_compatible_p (type, TREE_TYPE (op3)))
1882       || (op4 && TREE_CODE (op4) == SSA_NAME
1883           && !types_compatible_p (type, TREE_TYPE (op4))))
1884     {
1885       if (vect_print_dump_info (REPORT_DETAILS))
1886         {
1887           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1888           print_generic_expr (vect_dump, type, TDF_SLIM);
1889           fprintf (vect_dump, ", operands types: ");
1890           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1891           fprintf (vect_dump, ",");
1892           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1893           if (op3)
1894             {
1895               fprintf (vect_dump, ",");
1896               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1897             }
1898
1899           if (op4)
1900             {
1901               fprintf (vect_dump, ",");
1902               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1903             }
1904         }
1905
1906       return NULL;
1907     }
1908
1909   /* Check that it's ok to change the order of the computation.
1910      Generally, when vectorizing a reduction we change the order of the
1911      computation.  This may change the behavior of the program in some
1912      cases, so we need to check that this is ok.  One exception is when
1913      vectorizing an outer-loop: the inner-loop is executed sequentially,
1914      and therefore vectorizing reductions in the inner-loop during
1915      outer-loop vectorization is safe.  */
1916
1917   /* CHECKME: check for !flag_finite_math_only too?  */
1918   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1919       && check_reduction)
1920     {
1921       /* Changing the order of operations changes the semantics.  */
1922       if (vect_print_dump_info (REPORT_DETAILS))
1923         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1924       return NULL;
1925     }
1926   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1927            && check_reduction)
1928     {
1929       /* Changing the order of operations changes the semantics.  */
1930       if (vect_print_dump_info (REPORT_DETAILS))
1931         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1932       return NULL;
1933     }
1934   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1935     {
1936       /* Changing the order of operations changes the semantics.  */
1937       if (vect_print_dump_info (REPORT_DETAILS))
1938         report_vect_op (def_stmt,
1939                         "reduction: unsafe fixed-point math optimization: ");
1940       return NULL;
1941     }
1942
1943   /* If we detected "res -= x[i]" earlier, rewrite it into
1944      "res += -x[i]" now.  If this turns out to be useless reassoc
1945      will clean it up again.  */
1946   if (orig_code == MINUS_EXPR)
1947     {
1948       tree rhs = gimple_assign_rhs2 (def_stmt);
1949       tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1950       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1951                                                          rhs, NULL);
1952       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1953       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1954                                                           loop_info, NULL));
1955       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1956       gimple_assign_set_rhs2 (def_stmt, negrhs);
1957       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1958       update_stmt (def_stmt);
1959     }
1960
1961   /* Reduction is safe. We're dealing with one of the following:
1962      1) integer arithmetic and no trapv
1963      2) floating point arithmetic, and special flags permit this optimization
1964      3) nested cycle (i.e., outer loop vectorization).  */
1965   if (TREE_CODE (op1) == SSA_NAME)
1966     def1 = SSA_NAME_DEF_STMT (op1);
1967
1968   if (TREE_CODE (op2) == SSA_NAME)
1969     def2 = SSA_NAME_DEF_STMT (op2);
1970
1971   if (code != COND_EXPR
1972       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1973     {
1974       if (vect_print_dump_info (REPORT_DETAILS))
1975         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1976       return NULL;
1977     }
1978
1979   /* Check that one def is the reduction def, defined by PHI,
1980      the other def is either defined in the loop ("vect_internal_def"),
1981      or it's an induction (defined by a loop-header phi-node).  */
1982
1983   if (def2 && def2 == phi
1984       && (code == COND_EXPR
1985           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1986               && (is_gimple_assign (def1)
1987                   || is_gimple_call (def1)
1988                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1989                       == vect_induction_def
1990                   || (gimple_code (def1) == GIMPLE_PHI
1991                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1992                           == vect_internal_def
1993                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1994     {
1995       if (vect_print_dump_info (REPORT_DETAILS))
1996         report_vect_op (def_stmt, "detected reduction: ");
1997       return def_stmt;
1998     }
1999   else if (def1 && def1 == phi
2000            && (code == COND_EXPR
2001                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2002                    && (is_gimple_assign (def2)
2003                        || is_gimple_call (def2)
2004                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2005                            == vect_induction_def
2006                        || (gimple_code (def2) == GIMPLE_PHI
2007                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2008                                == vect_internal_def
2009                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
2010     {
2011       if (check_reduction)
2012         {
2013           /* Swap operands (just for simplicity - so that the rest of the code
2014              can assume that the reduction variable is always the last (second)
2015              argument).  */
2016           if (vect_print_dump_info (REPORT_DETAILS))
2017             report_vect_op (def_stmt,
2018                             "detected reduction: need to swap operands: ");
2019
2020           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2021                               gimple_assign_rhs2_ptr (def_stmt));
2022         }
2023       else
2024         {
2025           if (vect_print_dump_info (REPORT_DETAILS))
2026             report_vect_op (def_stmt, "detected reduction: ");
2027         }
2028
2029       return def_stmt;
2030     }
2031   else
2032     {
2033       if (vect_print_dump_info (REPORT_DETAILS))
2034         report_vect_op (def_stmt, "reduction: unknown pattern: ");
2035
2036       return NULL;
2037     }
2038 }
2039
2040 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2041    in-place.  Arguments as there.  */
2042
2043 static gimple
2044 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2045                           bool check_reduction, bool *double_reduc)
2046 {
2047   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2048                                      double_reduc, false);
2049 }
2050
2051 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2052    in-place if it enables detection of more reductions.  Arguments
2053    as there.  */
2054
2055 gimple
2056 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2057                           bool check_reduction, bool *double_reduc)
2058 {
2059   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2060                                      double_reduc, true);
2061 }
2062
2063 /* Calculate the cost of one scalar iteration of the loop.  */
2064 int
2065 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2066 {
2067   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2068   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2069   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2070   int innerloop_iters, i, stmt_cost;
2071
2072   /* Count statements in scalar loop.  Using this as scalar cost for a single
2073      iteration for now.
2074
2075      TODO: Add outer loop support.
2076
2077      TODO: Consider assigning different costs to different scalar
2078      statements.  */
2079
2080   /* FORNOW.  */
2081   innerloop_iters = 1;
2082   if (loop->inner)
2083     innerloop_iters = 50; /* FIXME */
2084
2085   for (i = 0; i < nbbs; i++)
2086     {
2087       gimple_stmt_iterator si;
2088       basic_block bb = bbs[i];
2089
2090       if (bb->loop_father == loop->inner)
2091         factor = innerloop_iters;
2092       else
2093         factor = 1;
2094
2095       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2096         {
2097           gimple stmt = gsi_stmt (si);
2098           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2099
2100           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2101             continue;
2102
2103           /* Skip stmts that are not vectorized inside the loop.  */
2104           if (stmt_info
2105               && !STMT_VINFO_RELEVANT_P (stmt_info)
2106               && (!STMT_VINFO_LIVE_P (stmt_info)
2107                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2108             continue;
2109
2110           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2111             {
2112               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2113                stmt_cost = vect_get_cost (scalar_load);
2114              else
2115                stmt_cost = vect_get_cost (scalar_store);
2116             }
2117           else
2118             stmt_cost = vect_get_cost (scalar_stmt);
2119
2120           scalar_single_iter_cost += stmt_cost * factor;
2121         }
2122     }
2123   return scalar_single_iter_cost;
2124 }
2125
2126 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2127 int
2128 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2129                              int *peel_iters_epilogue,
2130                              int scalar_single_iter_cost)
2131 {
2132   int peel_guard_costs = 0;
2133   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2134
2135   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2136     {
2137       *peel_iters_epilogue = vf/2;
2138       if (vect_print_dump_info (REPORT_COST))
2139         fprintf (vect_dump, "cost model: "
2140                             "epilogue peel iters set to vf/2 because "
2141                             "loop iterations are unknown .");
2142
2143       /* If peeled iterations are known but number of scalar loop
2144          iterations are unknown, count a taken branch per peeled loop.  */
2145       peel_guard_costs =  2 * vect_get_cost (cond_branch_taken);
2146     }
2147   else
2148     {
2149       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2150       peel_iters_prologue = niters < peel_iters_prologue ?
2151                             niters : peel_iters_prologue;
2152       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2153       /* If we need to peel for gaps, but no peeling is required, we have to
2154          peel VF iterations.  */
2155       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2156         *peel_iters_epilogue = vf;
2157     }
2158
2159    return (peel_iters_prologue * scalar_single_iter_cost)
2160             + (*peel_iters_epilogue * scalar_single_iter_cost)
2161            + peel_guard_costs;
2162 }
2163
2164 /* Function vect_estimate_min_profitable_iters
2165
2166    Return the number of iterations required for the vector version of the
2167    loop to be profitable relative to the cost of the scalar version of the
2168    loop.
2169
2170    TODO: Take profile info into account before making vectorization
2171    decisions, if available.  */
2172
2173 int
2174 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2175 {
2176   int i;
2177   int min_profitable_iters;
2178   int peel_iters_prologue;
2179   int peel_iters_epilogue;
2180   int vec_inside_cost = 0;
2181   int vec_outside_cost = 0;
2182   int scalar_single_iter_cost = 0;
2183   int scalar_outside_cost = 0;
2184   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2187   int nbbs = loop->num_nodes;
2188   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2189   int peel_guard_costs = 0;
2190   int innerloop_iters = 0, factor;
2191   VEC (slp_instance, heap) *slp_instances;
2192   slp_instance instance;
2193
2194   /* Cost model disabled.  */
2195   if (!flag_vect_cost_model)
2196     {
2197       if (vect_print_dump_info (REPORT_COST))
2198         fprintf (vect_dump, "cost model disabled.");
2199       return 0;
2200     }
2201
2202   /* Requires loop versioning tests to handle misalignment.  */
2203   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2204     {
2205       /*  FIXME: Make cost depend on complexity of individual check.  */
2206       vec_outside_cost +=
2207         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2208       if (vect_print_dump_info (REPORT_COST))
2209         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2210                  "versioning to treat misalignment.\n");
2211     }
2212
2213   /* Requires loop versioning with alias checks.  */
2214   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2215     {
2216       /*  FIXME: Make cost depend on complexity of individual check.  */
2217       vec_outside_cost +=
2218         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2219       if (vect_print_dump_info (REPORT_COST))
2220         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2221                  "versioning aliasing.\n");
2222     }
2223
2224   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2225       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2226     vec_outside_cost += vect_get_cost (cond_branch_taken);
2227
2228   /* Count statements in scalar loop.  Using this as scalar cost for a single
2229      iteration for now.
2230
2231      TODO: Add outer loop support.
2232
2233      TODO: Consider assigning different costs to different scalar
2234      statements.  */
2235
2236   /* FORNOW.  */
2237   if (loop->inner)
2238     innerloop_iters = 50; /* FIXME */
2239
2240   for (i = 0; i < nbbs; i++)
2241     {
2242       gimple_stmt_iterator si;
2243       basic_block bb = bbs[i];
2244
2245       if (bb->loop_father == loop->inner)
2246         factor = innerloop_iters;
2247       else
2248         factor = 1;
2249
2250       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2251         {
2252           gimple stmt = gsi_stmt (si);
2253           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2254           /* Skip stmts that are not vectorized inside the loop.  */
2255           if (!STMT_VINFO_RELEVANT_P (stmt_info)
2256               && (!STMT_VINFO_LIVE_P (stmt_info)
2257                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2258             continue;
2259           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2260           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2261              some of the "outside" costs are generated inside the outer-loop.  */
2262           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2263         }
2264     }
2265
2266   scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2267
2268   /* Add additional cost for the peeled instructions in prologue and epilogue
2269      loop.
2270
2271      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2272      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2273
2274      TODO: Build an expression that represents peel_iters for prologue and
2275      epilogue to be used in a run-time test.  */
2276
2277   if (npeel  < 0)
2278     {
2279       peel_iters_prologue = vf/2;
2280       if (vect_print_dump_info (REPORT_COST))
2281         fprintf (vect_dump, "cost model: "
2282                  "prologue peel iters set to vf/2.");
2283
2284       /* If peeling for alignment is unknown, loop bound of main loop becomes
2285          unknown.  */
2286       peel_iters_epilogue = vf/2;
2287       if (vect_print_dump_info (REPORT_COST))
2288         fprintf (vect_dump, "cost model: "
2289                  "epilogue peel iters set to vf/2 because "
2290                  "peeling for alignment is unknown .");
2291
2292       /* If peeled iterations are unknown, count a taken branch and a not taken
2293          branch per peeled loop. Even if scalar loop iterations are known,
2294          vector iterations are not known since peeled prologue iterations are
2295          not known. Hence guards remain the same.  */
2296       peel_guard_costs +=  2 * (vect_get_cost (cond_branch_taken)
2297                                 + vect_get_cost (cond_branch_not_taken));
2298       vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2299                            + (peel_iters_epilogue * scalar_single_iter_cost)
2300                            + peel_guard_costs;
2301     }
2302   else
2303     {
2304       peel_iters_prologue = npeel;
2305       vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2306                                     peel_iters_prologue, &peel_iters_epilogue,
2307                                     scalar_single_iter_cost);
2308     }
2309
2310   /* FORNOW: The scalar outside cost is incremented in one of the
2311      following ways:
2312
2313      1. The vectorizer checks for alignment and aliasing and generates
2314      a condition that allows dynamic vectorization.  A cost model
2315      check is ANDED with the versioning condition.  Hence scalar code
2316      path now has the added cost of the versioning check.
2317
2318        if (cost > th & versioning_check)
2319          jmp to vector code
2320
2321      Hence run-time scalar is incremented by not-taken branch cost.
2322
2323      2. The vectorizer then checks if a prologue is required.  If the
2324      cost model check was not done before during versioning, it has to
2325      be done before the prologue check.
2326
2327        if (cost <= th)
2328          prologue = scalar_iters
2329        if (prologue == 0)
2330          jmp to vector code
2331        else
2332          execute prologue
2333        if (prologue == num_iters)
2334          go to exit
2335
2336      Hence the run-time scalar cost is incremented by a taken branch,
2337      plus a not-taken branch, plus a taken branch cost.
2338
2339      3. The vectorizer then checks if an epilogue is required.  If the
2340      cost model check was not done before during prologue check, it
2341      has to be done with the epilogue check.
2342
2343        if (prologue == 0)
2344          jmp to vector code
2345        else
2346          execute prologue
2347        if (prologue == num_iters)
2348          go to exit
2349        vector code:
2350          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2351            jmp to epilogue
2352
2353      Hence the run-time scalar cost should be incremented by 2 taken
2354      branches.
2355
2356      TODO: The back end may reorder the BBS's differently and reverse
2357      conditions/branch directions.  Change the estimates below to
2358      something more reasonable.  */
2359
2360   /* If the number of iterations is known and we do not do versioning, we can
2361      decide whether to vectorize at compile time.  Hence the scalar version
2362      do not carry cost model guard costs.  */
2363   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2364       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2365       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2366     {
2367       /* Cost model check occurs at versioning.  */
2368       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2369           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2370         scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2371       else
2372         {
2373           /* Cost model check occurs at prologue generation.  */
2374           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2375             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2376                                    + vect_get_cost (cond_branch_not_taken);
2377           /* Cost model check occurs at epilogue generation.  */
2378           else
2379             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2380         }
2381     }
2382
2383   /* Add SLP costs.  */
2384   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2385   FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2386     {
2387       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2388       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2389     }
2390
2391   /* Calculate number of iterations required to make the vector version
2392      profitable, relative to the loop bodies only.  The following condition
2393      must hold true:
2394      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2395      where
2396      SIC = scalar iteration cost, VIC = vector iteration cost,
2397      VOC = vector outside cost, VF = vectorization factor,
2398      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2399      SOC = scalar outside cost for run time cost model check.  */
2400
2401   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2402     {
2403       if (vec_outside_cost <= 0)
2404         min_profitable_iters = 1;
2405       else
2406         {
2407           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2408                                   - vec_inside_cost * peel_iters_prologue
2409                                   - vec_inside_cost * peel_iters_epilogue)
2410                                  / ((scalar_single_iter_cost * vf)
2411                                     - vec_inside_cost);
2412
2413           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2414               <= ((vec_inside_cost * min_profitable_iters)
2415                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2416             min_profitable_iters++;
2417         }
2418     }
2419   /* vector version will never be profitable.  */
2420   else
2421     {
2422       if (vect_print_dump_info (REPORT_COST))
2423         fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2424                  "divided by the scalar iteration cost = %d "
2425                  "is greater or equal to the vectorization factor = %d.",
2426                  vec_inside_cost, scalar_single_iter_cost, vf);
2427       return -1;
2428     }
2429
2430   if (vect_print_dump_info (REPORT_COST))
2431     {
2432       fprintf (vect_dump, "Cost model analysis: \n");
2433       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2434                vec_inside_cost);
2435       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2436                vec_outside_cost);
2437       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2438                scalar_single_iter_cost);
2439       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2440       fprintf (vect_dump, "  prologue iterations: %d\n",
2441                peel_iters_prologue);
2442       fprintf (vect_dump, "  epilogue iterations: %d\n",
2443                peel_iters_epilogue);
2444       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2445                min_profitable_iters);
2446     }
2447
2448   min_profitable_iters =
2449         min_profitable_iters < vf ? vf : min_profitable_iters;
2450
2451   /* Because the condition we create is:
2452      if (niters <= min_profitable_iters)
2453        then skip the vectorized loop.  */
2454   min_profitable_iters--;
2455
2456   if (vect_print_dump_info (REPORT_COST))
2457     fprintf (vect_dump, "  Profitability threshold = %d\n",
2458              min_profitable_iters);
2459
2460   return min_profitable_iters;
2461 }
2462
2463
2464 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2465    functions. Design better to avoid maintenance issues.  */
2466
2467 /* Function vect_model_reduction_cost.
2468
2469    Models cost for a reduction operation, including the vector ops
2470    generated within the strip-mine loop, the initial definition before
2471    the loop, and the epilogue code that must be generated.  */
2472
2473 static bool
2474 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2475                            int ncopies)
2476 {
2477   int outer_cost = 0;
2478   enum tree_code code;
2479   optab optab;
2480   tree vectype;
2481   gimple stmt, orig_stmt;
2482   tree reduction_op;
2483   enum machine_mode mode;
2484   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2485   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2486
2487
2488   /* Cost of reduction op inside loop.  */
2489   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2490     += ncopies * vect_get_cost (vector_stmt);
2491
2492   stmt = STMT_VINFO_STMT (stmt_info);
2493
2494   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2495     {
2496     case GIMPLE_SINGLE_RHS:
2497       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2498       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2499       break;
2500     case GIMPLE_UNARY_RHS:
2501       reduction_op = gimple_assign_rhs1 (stmt);
2502       break;
2503     case GIMPLE_BINARY_RHS:
2504       reduction_op = gimple_assign_rhs2 (stmt);
2505       break;
2506     default:
2507       gcc_unreachable ();
2508     }
2509
2510   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2511   if (!vectype)
2512     {
2513       if (vect_print_dump_info (REPORT_COST))
2514         {
2515           fprintf (vect_dump, "unsupported data-type ");
2516           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2517         }
2518       return false;
2519    }
2520
2521   mode = TYPE_MODE (vectype);
2522   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2523
2524   if (!orig_stmt)
2525     orig_stmt = STMT_VINFO_STMT (stmt_info);
2526
2527   code = gimple_assign_rhs_code (orig_stmt);
2528
2529   /* Add in cost for initial definition.  */
2530   outer_cost += vect_get_cost (scalar_to_vec);
2531
2532   /* Determine cost of epilogue code.
2533
2534      We have a reduction operator that will reduce the vector in one statement.
2535      Also requires scalar extract.  */
2536
2537   if (!nested_in_vect_loop_p (loop, orig_stmt))
2538     {
2539       if (reduc_code != ERROR_MARK)
2540         outer_cost += vect_get_cost (vector_stmt)
2541                       + vect_get_cost (vec_to_scalar);
2542       else
2543         {
2544           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2545           tree bitsize =
2546             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2547           int element_bitsize = tree_low_cst (bitsize, 1);
2548           int nelements = vec_size_in_bits / element_bitsize;
2549
2550           optab = optab_for_tree_code (code, vectype, optab_default);
2551
2552           /* We have a whole vector shift available.  */
2553           if (VECTOR_MODE_P (mode)
2554               && optab_handler (optab, mode) != CODE_FOR_nothing
2555               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2556             /* Final reduction via vector shifts and the reduction operator. Also
2557                requires scalar extract.  */
2558             outer_cost += ((exact_log2(nelements) * 2)
2559               * vect_get_cost (vector_stmt)
2560               + vect_get_cost (vec_to_scalar));
2561           else
2562             /* Use extracts and reduction op for final reduction.  For N elements,
2563                we have N extracts and N-1 reduction ops.  */
2564             outer_cost += ((nelements + nelements - 1)
2565               * vect_get_cost (vector_stmt));
2566         }
2567     }
2568
2569   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2570
2571   if (vect_print_dump_info (REPORT_COST))
2572     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2573              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2574              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2575
2576   return true;
2577 }
2578
2579
2580 /* Function vect_model_induction_cost.
2581
2582    Models cost for induction operations.  */
2583
2584 static void
2585 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2586 {
2587   /* loop cost for vec_loop.  */
2588   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2589     = ncopies * vect_get_cost (vector_stmt);
2590   /* prologue cost for vec_init and vec_step.  */
2591   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2592     = 2 * vect_get_cost (scalar_to_vec);
2593
2594   if (vect_print_dump_info (REPORT_COST))
2595     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2596              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2597              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2598 }
2599
2600
2601 /* Function get_initial_def_for_induction
2602
2603    Input:
2604    STMT - a stmt that performs an induction operation in the loop.
2605    IV_PHI - the initial value of the induction variable
2606
2607    Output:
2608    Return a vector variable, initialized with the first VF values of
2609    the induction variable.  E.g., for an iv with IV_PHI='X' and
2610    evolution S, for a vector of 4 units, we want to return:
2611    [X, X + S, X + 2*S, X + 3*S].  */
2612
2613 static tree
2614 get_initial_def_for_induction (gimple iv_phi)
2615 {
2616   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2617   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2618   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2619   tree scalar_type;
2620   tree vectype;
2621   int nunits;
2622   edge pe = loop_preheader_edge (loop);
2623   struct loop *iv_loop;
2624   basic_block new_bb;
2625   tree vec, vec_init, vec_step, t;
2626   tree access_fn;
2627   tree new_var;
2628   tree new_name;
2629   gimple init_stmt, induction_phi, new_stmt;
2630   tree induc_def, vec_def, vec_dest;
2631   tree init_expr, step_expr;
2632   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2633   int i;
2634   bool ok;
2635   int ncopies;
2636   tree expr;
2637   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2638   bool nested_in_vect_loop = false;
2639   gimple_seq stmts = NULL;
2640   imm_use_iterator imm_iter;
2641   use_operand_p use_p;
2642   gimple exit_phi;
2643   edge latch_e;
2644   tree loop_arg;
2645   gimple_stmt_iterator si;
2646   basic_block bb = gimple_bb (iv_phi);
2647   tree stepvectype;
2648   tree resvectype;
2649
2650   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2651   if (nested_in_vect_loop_p (loop, iv_phi))
2652     {
2653       nested_in_vect_loop = true;
2654       iv_loop = loop->inner;
2655     }
2656   else
2657     iv_loop = loop;
2658   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2659
2660   latch_e = loop_latch_edge (iv_loop);
2661   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2662
2663   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2664   gcc_assert (access_fn);
2665   STRIP_NOPS (access_fn);
2666   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2667                                     &init_expr, &step_expr);
2668   gcc_assert (ok);
2669   pe = loop_preheader_edge (iv_loop);
2670
2671   scalar_type = TREE_TYPE (init_expr);
2672   vectype = get_vectype_for_scalar_type (scalar_type);
2673   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2674   gcc_assert (vectype);
2675   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2676   ncopies = vf / nunits;
2677
2678   gcc_assert (phi_info);
2679   gcc_assert (ncopies >= 1);
2680
2681   /* Find the first insertion point in the BB.  */
2682   si = gsi_after_labels (bb);
2683
2684   /* Create the vector that holds the initial_value of the induction.  */
2685   if (nested_in_vect_loop)
2686     {
2687       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2688          been created during vectorization of previous stmts.  We obtain it
2689          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
2690       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2691                                            loop_preheader_edge (iv_loop));
2692       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2693     }
2694   else
2695     {
2696       /* iv_loop is the loop to be vectorized. Create:
2697          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2698       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2699       add_referenced_var (new_var);
2700
2701       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2702       if (stmts)
2703         {
2704           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2705           gcc_assert (!new_bb);
2706         }
2707
2708       t = NULL_TREE;
2709       t = tree_cons (NULL_TREE, new_name, t);
2710       for (i = 1; i < nunits; i++)
2711         {
2712           /* Create: new_name_i = new_name + step_expr  */
2713           enum tree_code code = POINTER_TYPE_P (scalar_type)
2714                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2715           init_stmt = gimple_build_assign_with_ops (code, new_var,
2716                                                     new_name, step_expr);
2717           new_name = make_ssa_name (new_var, init_stmt);
2718           gimple_assign_set_lhs (init_stmt, new_name);
2719
2720           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2721           gcc_assert (!new_bb);
2722
2723           if (vect_print_dump_info (REPORT_DETAILS))
2724             {
2725               fprintf (vect_dump, "created new init_stmt: ");
2726               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2727             }
2728           t = tree_cons (NULL_TREE, new_name, t);
2729         }
2730       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2731       vec = build_constructor_from_list (vectype, nreverse (t));
2732       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2733     }
2734
2735
2736   /* Create the vector that holds the step of the induction.  */
2737   if (nested_in_vect_loop)
2738     /* iv_loop is nested in the loop to be vectorized. Generate:
2739        vec_step = [S, S, S, S]  */
2740     new_name = step_expr;
2741   else
2742     {
2743       /* iv_loop is the loop to be vectorized. Generate:
2744           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2745       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2746       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2747                               expr, step_expr);
2748     }
2749
2750   t = unshare_expr (new_name);
2751   gcc_assert (CONSTANT_CLASS_P (new_name));
2752   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2753   gcc_assert (stepvectype);
2754   vec = build_vector_from_val (stepvectype, t);
2755   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2756
2757
2758   /* Create the following def-use cycle:
2759      loop prolog:
2760          vec_init = ...
2761          vec_step = ...
2762      loop:
2763          vec_iv = PHI <vec_init, vec_loop>
2764          ...
2765          STMT
2766          ...
2767          vec_loop = vec_iv + vec_step;  */
2768
2769   /* Create the induction-phi that defines the induction-operand.  */
2770   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2771   add_referenced_var (vec_dest);
2772   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2773   set_vinfo_for_stmt (induction_phi,
2774                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2775   induc_def = PHI_RESULT (induction_phi);
2776
2777   /* Create the iv update inside the loop  */
2778   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2779                                            induc_def, vec_step);
2780   vec_def = make_ssa_name (vec_dest, new_stmt);
2781   gimple_assign_set_lhs (new_stmt, vec_def);
2782   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2783   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2784                                                    NULL));
2785
2786   /* Set the arguments of the phi node:  */
2787   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2788   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2789                UNKNOWN_LOCATION);
2790
2791
2792   /* In case that vectorization factor (VF) is bigger than the number
2793      of elements that we can fit in a vectype (nunits), we have to generate
2794      more than one vector stmt - i.e - we need to "unroll" the
2795      vector stmt by a factor VF/nunits.  For more details see documentation
2796      in vectorizable_operation.  */
2797
2798   if (ncopies > 1)
2799     {
2800       stmt_vec_info prev_stmt_vinfo;
2801       /* FORNOW. This restriction should be relaxed.  */
2802       gcc_assert (!nested_in_vect_loop);
2803
2804       /* Create the vector that holds the step of the induction.  */
2805       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2806       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2807                               expr, step_expr);
2808       t = unshare_expr (new_name);
2809       gcc_assert (CONSTANT_CLASS_P (new_name));
2810       vec = build_vector_from_val (stepvectype, t);
2811       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2812
2813       vec_def = induc_def;
2814       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2815       for (i = 1; i < ncopies; i++)
2816         {
2817           /* vec_i = vec_prev + vec_step  */
2818           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2819                                                    vec_def, vec_step);
2820           vec_def = make_ssa_name (vec_dest, new_stmt);
2821           gimple_assign_set_lhs (new_stmt, vec_def);
2822
2823           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2824           if (!useless_type_conversion_p (resvectype, vectype))
2825             {
2826               new_stmt = gimple_build_assign_with_ops
2827                   (VIEW_CONVERT_EXPR,
2828                    vect_get_new_vect_var (resvectype, vect_simple_var,
2829                                           "vec_iv_"),
2830                    build1 (VIEW_CONVERT_EXPR, resvectype,
2831                            gimple_assign_lhs (new_stmt)), NULL_TREE);
2832               gimple_assign_set_lhs (new_stmt,
2833                                      make_ssa_name
2834                                        (gimple_assign_lhs (new_stmt), new_stmt));
2835               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2836             }
2837           set_vinfo_for_stmt (new_stmt,
2838                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2839           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2840           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2841         }
2842     }
2843
2844   if (nested_in_vect_loop)
2845     {
2846       /* Find the loop-closed exit-phi of the induction, and record
2847          the final vector of induction results:  */
2848       exit_phi = NULL;
2849       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2850         {
2851           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2852             {
2853               exit_phi = USE_STMT (use_p);
2854               break;
2855             }
2856         }
2857       if (exit_phi)
2858         {
2859           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2860           /* FORNOW. Currently not supporting the case that an inner-loop induction
2861              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2862           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2863                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2864
2865           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2866           if (vect_print_dump_info (REPORT_DETAILS))
2867             {
2868               fprintf (vect_dump, "vector of inductions after inner-loop:");
2869               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2870             }
2871         }
2872     }
2873
2874
2875   if (vect_print_dump_info (REPORT_DETAILS))
2876     {
2877       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2878       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2879       fprintf (vect_dump, "\n");
2880       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2881     }
2882
2883   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2884   if (!useless_type_conversion_p (resvectype, vectype))
2885     {
2886       new_stmt = gimple_build_assign_with_ops
2887          (VIEW_CONVERT_EXPR,
2888           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
2889           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
2890       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
2891       gimple_assign_set_lhs (new_stmt, induc_def);
2892       si = gsi_start_bb (bb);
2893       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2894       set_vinfo_for_stmt (new_stmt,
2895                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2896       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
2897         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
2898     }
2899
2900   return induc_def;
2901 }
2902
2903
2904 /* Function get_initial_def_for_reduction
2905
2906    Input:
2907    STMT - a stmt that performs a reduction operation in the loop.
2908    INIT_VAL - the initial value of the reduction variable
2909
2910    Output:
2911    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2912         of the reduction (used for adjusting the epilog - see below).
2913    Return a vector variable, initialized according to the operation that STMT
2914         performs. This vector will be used as the initial value of the
2915         vector of partial results.
2916
2917    Option1 (adjust in epilog): Initialize the vector as follows:
2918      add/bit or/xor:    [0,0,...,0,0]
2919      mult/bit and:      [1,1,...,1,1]
2920      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2921    and when necessary (e.g. add/mult case) let the caller know
2922    that it needs to adjust the result by init_val.
2923
2924    Option2: Initialize the vector as follows:
2925      add/bit or/xor:    [init_val,0,0,...,0]
2926      mult/bit and:      [init_val,1,1,...,1]
2927      min/max/cond_expr: [init_val,init_val,...,init_val]
2928    and no adjustments are needed.
2929
2930    For example, for the following code:
2931
2932    s = init_val;
2933    for (i=0;i<n;i++)
2934      s = s + a[i];
2935
2936    STMT is 's = s + a[i]', and the reduction variable is 's'.
2937    For a vector of 4 units, we want to return either [0,0,0,init_val],
2938    or [0,0,0,0] and let the caller know that it needs to adjust
2939    the result at the end by 'init_val'.
2940
2941    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2942    initialization vector is simpler (same element in all entries), if
2943    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2944
2945    A cost model should help decide between these two schemes.  */
2946
2947 tree
2948 get_initial_def_for_reduction (gimple stmt, tree init_val,
2949                                tree *adjustment_def)
2950 {
2951   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2952   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2953   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2954   tree scalar_type = TREE_TYPE (init_val);
2955   tree vectype = get_vectype_for_scalar_type (scalar_type);
2956   int nunits;
2957   enum tree_code code = gimple_assign_rhs_code (stmt);
2958   tree def_for_init;
2959   tree init_def;
2960   tree t = NULL_TREE;
2961   int i;
2962   bool nested_in_vect_loop = false;
2963   tree init_value;
2964   REAL_VALUE_TYPE real_init_val = dconst0;
2965   int int_init_val = 0;
2966   gimple def_stmt = NULL;
2967
2968   gcc_assert (vectype);
2969   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2970
2971   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2972               || SCALAR_FLOAT_TYPE_P (scalar_type));
2973
2974   if (nested_in_vect_loop_p (loop, stmt))
2975     nested_in_vect_loop = true;
2976   else
2977     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2978
2979   /* In case of double reduction we only create a vector variable to be put
2980      in the reduction phi node.  The actual statement creation is done in
2981      vect_create_epilog_for_reduction.  */
2982   if (adjustment_def && nested_in_vect_loop
2983       && TREE_CODE (init_val) == SSA_NAME
2984       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2985       && gimple_code (def_stmt) == GIMPLE_PHI
2986       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2987       && vinfo_for_stmt (def_stmt)
2988       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2989           == vect_double_reduction_def)
2990     {
2991       *adjustment_def = NULL;
2992       return vect_create_destination_var (init_val, vectype);
2993     }
2994
2995   if (TREE_CONSTANT (init_val))
2996     {
2997       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2998         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2999       else
3000         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3001     }
3002   else
3003     init_value = init_val;
3004
3005   switch (code)
3006     {
3007       case WIDEN_SUM_EXPR:
3008       case DOT_PROD_EXPR:
3009       case PLUS_EXPR:
3010       case MINUS_EXPR:
3011       case BIT_IOR_EXPR:
3012       case BIT_XOR_EXPR:
3013       case MULT_EXPR:
3014       case BIT_AND_EXPR:
3015         /* ADJUSMENT_DEF is NULL when called from
3016            vect_create_epilog_for_reduction to vectorize double reduction.  */
3017         if (adjustment_def)
3018           {
3019             if (nested_in_vect_loop)
3020               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3021                                                               NULL);
3022             else
3023               *adjustment_def = init_val;
3024           }
3025
3026         if (code == MULT_EXPR)
3027           {
3028             real_init_val = dconst1;
3029             int_init_val = 1;
3030           }
3031
3032         if (code == BIT_AND_EXPR)
3033           int_init_val = -1;
3034
3035         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3036           def_for_init = build_real (scalar_type, real_init_val);
3037         else
3038           def_for_init = build_int_cst (scalar_type, int_init_val);
3039
3040         /* Create a vector of '0' or '1' except the first element.  */
3041         for (i = nunits - 2; i >= 0; --i)
3042           t = tree_cons (NULL_TREE, def_for_init, t);
3043
3044         /* Option1: the first element is '0' or '1' as well.  */
3045         if (adjustment_def)
3046           {
3047             t = tree_cons (NULL_TREE, def_for_init, t);
3048             init_def = build_vector (vectype, t);
3049             break;
3050           }
3051
3052         /* Option2: the first element is INIT_VAL.  */
3053         t = tree_cons (NULL_TREE, init_value, t);
3054         if (TREE_CONSTANT (init_val))
3055           init_def = build_vector (vectype, t);
3056         else
3057           init_def = build_constructor_from_list (vectype, t);
3058
3059         break;
3060
3061       case MIN_EXPR:
3062       case MAX_EXPR:
3063       case COND_EXPR:
3064         if (adjustment_def)
3065           {
3066             *adjustment_def = NULL_TREE;
3067             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3068             break;
3069           }
3070
3071         init_def = build_vector_from_val (vectype, init_value);
3072         break;
3073
3074       default:
3075         gcc_unreachable ();
3076     }
3077
3078   return init_def;
3079 }
3080
3081
3082 /* Function vect_create_epilog_for_reduction
3083
3084    Create code at the loop-epilog to finalize the result of a reduction
3085    computation.
3086
3087    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3088      reduction statements.
3089    STMT is the scalar reduction stmt that is being vectorized.
3090    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3091      number of elements that we can fit in a vectype (nunits).  In this case
3092      we have to generate more than one vector stmt - i.e - we need to "unroll"
3093      the vector stmt by a factor VF/nunits.  For more details see documentation
3094      in vectorizable_operation.
3095    REDUC_CODE is the tree-code for the epilog reduction.
3096    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3097      computation.
3098    REDUC_INDEX is the index of the operand in the right hand side of the
3099      statement that is defined by REDUCTION_PHI.
3100    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3101    SLP_NODE is an SLP node containing a group of reduction statements. The
3102      first one in this group is STMT.
3103
3104    This function:
3105    1. Creates the reduction def-use cycles: sets the arguments for
3106       REDUCTION_PHIS:
3107       The loop-entry argument is the vectorized initial-value of the reduction.
3108       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3109       sums.
3110    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3111       by applying the operation specified by REDUC_CODE if available, or by
3112       other means (whole-vector shifts or a scalar loop).
3113       The function also creates a new phi node at the loop exit to preserve
3114       loop-closed form, as illustrated below.
3115
3116      The flow at the entry to this function:
3117
3118         loop:
3119           vec_def = phi <null, null>            # REDUCTION_PHI
3120           VECT_DEF = vector_stmt                # vectorized form of STMT
3121           s_loop = scalar_stmt                  # (scalar) STMT
3122         loop_exit:
3123           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3124           use <s_out0>
3125           use <s_out0>
3126
3127      The above is transformed by this function into:
3128
3129         loop:
3130           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3131           VECT_DEF = vector_stmt                # vectorized form of STMT
3132           s_loop = scalar_stmt                  # (scalar) STMT
3133         loop_exit:
3134           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3135           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3136           v_out2 = reduce <v_out1>
3137           s_out3 = extract_field <v_out2, 0>
3138           s_out4 = adjust_result <s_out3>
3139           use <s_out4>
3140           use <s_out4>
3141 */
3142
3143 static void
3144 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3145                                   int ncopies, enum tree_code reduc_code,
3146                                   VEC (gimple, heap) *reduction_phis,
3147                                   int reduc_index, bool double_reduc,
3148                                   slp_tree slp_node)
3149 {
3150   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3151   stmt_vec_info prev_phi_info;
3152   tree vectype;
3153   enum machine_mode mode;
3154   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3156   basic_block exit_bb;
3157   tree scalar_dest;
3158   tree scalar_type;
3159   gimple new_phi = NULL, phi;
3160   gimple_stmt_iterator exit_gsi;
3161   tree vec_dest;
3162   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3163   gimple epilog_stmt = NULL;
3164   enum tree_code code = gimple_assign_rhs_code (stmt);
3165   gimple exit_phi;
3166   tree bitsize, bitpos;
3167   tree adjustment_def = NULL;
3168   tree vec_initial_def = NULL;
3169   tree reduction_op, expr, def;
3170   tree orig_name, scalar_result;
3171   imm_use_iterator imm_iter, phi_imm_iter;
3172   use_operand_p use_p, phi_use_p;
3173   bool extract_scalar_result = false;
3174   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3175   bool nested_in_vect_loop = false;
3176   VEC (gimple, heap) *new_phis = NULL;
3177   enum vect_def_type dt = vect_unknown_def_type;
3178   int j, i;
3179   VEC (tree, heap) *scalar_results = NULL;
3180   unsigned int group_size = 1, k, ratio;
3181   VEC (tree, heap) *vec_initial_defs = NULL;
3182   VEC (gimple, heap) *phis;
3183
3184   if (slp_node)
3185     group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3186
3187   if (nested_in_vect_loop_p (loop, stmt))
3188     {
3189       outer_loop = loop;
3190       loop = loop->inner;
3191       nested_in_vect_loop = true;
3192       gcc_assert (!slp_node);
3193     }
3194
3195   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3196     {
3197     case GIMPLE_SINGLE_RHS:
3198       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3199                                        == ternary_op);
3200       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3201       break;
3202     case GIMPLE_UNARY_RHS:
3203       reduction_op = gimple_assign_rhs1 (stmt);
3204       break;
3205     case GIMPLE_BINARY_RHS:
3206       reduction_op = reduc_index ?
3207                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3208       break;
3209     default:
3210       gcc_unreachable ();
3211     }
3212
3213   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3214   gcc_assert (vectype);
3215   mode = TYPE_MODE (vectype);
3216
3217   /* 1. Create the reduction def-use cycle:
3218      Set the arguments of REDUCTION_PHIS, i.e., transform
3219
3220         loop:
3221           vec_def = phi <null, null>            # REDUCTION_PHI
3222           VECT_DEF = vector_stmt                # vectorized form of STMT
3223           ...
3224
3225      into:
3226
3227         loop:
3228           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3229           VECT_DEF = vector_stmt                # vectorized form of STMT
3230           ...
3231
3232      (in case of SLP, do it for all the phis). */
3233
3234   /* Get the loop-entry arguments.  */
3235   if (slp_node)
3236     vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3237                        NULL, reduc_index);
3238   else
3239     {
3240       vec_initial_defs = VEC_alloc (tree, heap, 1);
3241      /* For the case of reduction, vect_get_vec_def_for_operand returns
3242         the scalar def before the loop, that defines the initial value
3243         of the reduction variable.  */
3244       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3245                                                       &adjustment_def);
3246       VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3247     }
3248
3249   /* Set phi nodes arguments.  */
3250   FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3251     {
3252       tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3253       tree def = VEC_index (tree, vect_defs, i);
3254       for (j = 0; j < ncopies; j++)
3255         {
3256           /* Set the loop-entry arg of the reduction-phi.  */
3257           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3258                        UNKNOWN_LOCATION);
3259
3260           /* Set the loop-latch arg for the reduction-phi.  */
3261           if (j > 0)
3262             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3263
3264           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3265
3266           if (vect_print_dump_info (REPORT_DETAILS))
3267             {
3268               fprintf (vect_dump, "transform reduction: created def-use"
3269                                   " cycle: ");
3270               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3271               fprintf (vect_dump, "\n");
3272               print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3273                                  TDF_SLIM);
3274             }
3275
3276           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3277         }
3278     }
3279
3280   VEC_free (tree, heap, vec_initial_defs);
3281
3282   /* 2. Create epilog code.
3283         The reduction epilog code operates across the elements of the vector
3284         of partial results computed by the vectorized loop.
3285         The reduction epilog code consists of:
3286
3287         step 1: compute the scalar result in a vector (v_out2)
3288         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3289         step 3: adjust the scalar result (s_out3) if needed.
3290
3291         Step 1 can be accomplished using one the following three schemes:
3292           (scheme 1) using reduc_code, if available.
3293           (scheme 2) using whole-vector shifts, if available.
3294           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3295                      combined.
3296
3297           The overall epilog code looks like this:
3298
3299           s_out0 = phi <s_loop>         # original EXIT_PHI
3300           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3301           v_out2 = reduce <v_out1>              # step 1
3302           s_out3 = extract_field <v_out2, 0>    # step 2
3303           s_out4 = adjust_result <s_out3>       # step 3
3304
3305           (step 3 is optional, and steps 1 and 2 may be combined).
3306           Lastly, the uses of s_out0 are replaced by s_out4.  */
3307
3308
3309   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3310          v_out1 = phi <VECT_DEF>
3311          Store them in NEW_PHIS.  */
3312
3313   exit_bb = single_exit (loop)->dest;
3314   prev_phi_info = NULL;
3315   new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3316   FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3317     {
3318       for (j = 0; j < ncopies; j++)
3319         {
3320           phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3321           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3322           if (j == 0)
3323             VEC_quick_push (gimple, new_phis, phi);
3324           else
3325             {
3326               def = vect_get_vec_def_for_stmt_copy (dt, def);
3327               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3328             }
3329
3330           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3331           prev_phi_info = vinfo_for_stmt (phi);
3332         }
3333     }
3334
3335   /* The epilogue is created for the outer-loop, i.e., for the loop being
3336      vectorized.  */
3337   if (double_reduc)
3338     {
3339       loop = outer_loop;
3340       exit_bb = single_exit (loop)->dest;
3341     }
3342
3343   exit_gsi = gsi_after_labels (exit_bb);
3344
3345   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3346          (i.e. when reduc_code is not available) and in the final adjustment
3347          code (if needed).  Also get the original scalar reduction variable as
3348          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3349          represents a reduction pattern), the tree-code and scalar-def are
3350          taken from the original stmt that the pattern-stmt (STMT) replaces.
3351          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3352          are taken from STMT.  */
3353
3354   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3355   if (!orig_stmt)
3356     {
3357       /* Regular reduction  */
3358       orig_stmt = stmt;
3359     }
3360   else
3361     {
3362       /* Reduction pattern  */
3363       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3364       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3365       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3366     }
3367
3368   code = gimple_assign_rhs_code (orig_stmt);
3369   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3370      partial results are added and not subtracted.  */
3371   if (code == MINUS_EXPR)
3372     code = PLUS_EXPR;
3373
3374   scalar_dest = gimple_assign_lhs (orig_stmt);
3375   scalar_type = TREE_TYPE (scalar_dest);
3376   scalar_results = VEC_alloc (tree, heap, group_size);
3377   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3378   bitsize = TYPE_SIZE (scalar_type);
3379
3380   /* In case this is a reduction in an inner-loop while vectorizing an outer
3381      loop - we don't need to extract a single scalar result at the end of the
3382      inner-loop (unless it is double reduction, i.e., the use of reduction is
3383      outside the outer-loop).  The final vector of partial results will be used
3384      in the vectorized outer-loop, or reduced to a scalar result at the end of
3385      the outer-loop.  */
3386   if (nested_in_vect_loop && !double_reduc)
3387     goto vect_finalize_reduction;
3388
3389   /* 2.3 Create the reduction code, using one of the three schemes described
3390          above. In SLP we simply need to extract all the elements from the
3391          vector (without reducing them), so we use scalar shifts.  */
3392   if (reduc_code != ERROR_MARK && !slp_node)
3393     {
3394       tree tmp;
3395
3396       /*** Case 1:  Create:
3397            v_out2 = reduc_expr <v_out1>  */
3398
3399       if (vect_print_dump_info (REPORT_DETAILS))
3400         fprintf (vect_dump, "Reduce using direct vector reduction.");
3401
3402       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3403       new_phi = VEC_index (gimple, new_phis, 0);
3404       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3405       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3406       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3407       gimple_assign_set_lhs (epilog_stmt, new_temp);
3408       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3409
3410       extract_scalar_result = true;
3411     }
3412   else
3413     {
3414       enum tree_code shift_code = ERROR_MARK;
3415       bool have_whole_vector_shift = true;
3416       int bit_offset;
3417       int element_bitsize = tree_low_cst (bitsize, 1);
3418       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3419       tree vec_temp;
3420
3421       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3422         shift_code = VEC_RSHIFT_EXPR;
3423       else
3424         have_whole_vector_shift = false;
3425
3426       /* Regardless of whether we have a whole vector shift, if we're
3427          emulating the operation via tree-vect-generic, we don't want
3428          to use it.  Only the first round of the reduction is likely
3429          to still be profitable via emulation.  */
3430       /* ??? It might be better to emit a reduction tree code here, so that
3431          tree-vect-generic can expand the first round via bit tricks.  */
3432       if (!VECTOR_MODE_P (mode))
3433         have_whole_vector_shift = false;
3434       else
3435         {
3436           optab optab = optab_for_tree_code (code, vectype, optab_default);
3437           if (optab_handler (optab, mode) == CODE_FOR_nothing)
3438             have_whole_vector_shift = false;
3439         }
3440
3441       if (have_whole_vector_shift && !slp_node)
3442         {
3443           /*** Case 2: Create:
3444              for (offset = VS/2; offset >= element_size; offset/=2)
3445                 {
3446                   Create:  va' = vec_shift <va, offset>
3447                   Create:  va = vop <va, va'>
3448                 }  */
3449
3450           if (vect_print_dump_info (REPORT_DETAILS))
3451             fprintf (vect_dump, "Reduce using vector shifts");
3452
3453           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3454           new_phi = VEC_index (gimple, new_phis, 0);
3455           new_temp = PHI_RESULT (new_phi);
3456           for (bit_offset = vec_size_in_bits/2;
3457                bit_offset >= element_bitsize;
3458                bit_offset /= 2)
3459             {
3460               tree bitpos = size_int (bit_offset);
3461
3462               epilog_stmt = gimple_build_assign_with_ops (shift_code,
3463                                                vec_dest, new_temp, bitpos);
3464               new_name = make_ssa_name (vec_dest, epilog_stmt);
3465               gimple_assign_set_lhs (epilog_stmt, new_name);
3466               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3467
3468               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3469                                                           new_name, new_temp);
3470               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3471               gimple_assign_set_lhs (epilog_stmt, new_temp);
3472               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3473             }
3474
3475           extract_scalar_result = true;
3476         }
3477       else
3478         {
3479           tree rhs;
3480
3481           /*** Case 3: Create:
3482              s = extract_field <v_out2, 0>
3483              for (offset = element_size;
3484                   offset < vector_size;
3485                   offset += element_size;)
3486                {
3487                  Create:  s' = extract_field <v_out2, offset>
3488                  Create:  s = op <s, s'>  // For non SLP cases
3489                }  */
3490
3491           if (vect_print_dump_info (REPORT_DETAILS))
3492             fprintf (vect_dump, "Reduce using scalar code. ");
3493
3494           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3495           FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3496             {
3497               vec_temp = PHI_RESULT (new_phi);
3498               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3499                             bitsize_zero_node);
3500               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3501               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3502               gimple_assign_set_lhs (epilog_stmt, new_temp);
3503               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3504
3505               /* In SLP we don't need to apply reduction operation, so we just
3506                  collect s' values in SCALAR_RESULTS.  */
3507               if (slp_node)
3508                 VEC_safe_push (tree, heap, scalar_results, new_temp);
3509
3510               for (bit_offset = element_bitsize;
3511                    bit_offset < vec_size_in_bits;
3512                    bit_offset += element_bitsize)
3513                 {
3514                   tree bitpos = bitsize_int (bit_offset);
3515                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3516                                      bitsize, bitpos);
3517
3518                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3519                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3520                   gimple_assign_set_lhs (epilog_stmt, new_name);
3521                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3522
3523                   if (slp_node)
3524                     {
3525                       /* In SLP we don't need to apply reduction operation, so
3526                          we just collect s' values in SCALAR_RESULTS.  */
3527                       new_temp = new_name;
3528                       VEC_safe_push (tree, heap, scalar_results, new_name);
3529                     }
3530                   else
3531                     {
3532                       epilog_stmt = gimple_build_assign_with_ops (code,
3533                                           new_scalar_dest, new_name, new_temp);
3534                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3535                       gimple_assign_set_lhs (epilog_stmt, new_temp);
3536                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3537                     }
3538                 }
3539             }
3540
3541           /* The only case where we need to reduce scalar results in SLP, is
3542              unrolling.  If the size of SCALAR_RESULTS is greater than
3543              GROUP_SIZE, we reduce them combining elements modulo
3544              GROUP_SIZE.  */
3545           if (slp_node)
3546             {
3547               tree res, first_res, new_res;
3548               gimple new_stmt;
3549
3550               /* Reduce multiple scalar results in case of SLP unrolling.  */
3551               for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3552                    j++)
3553                 {
3554                   first_res = VEC_index (tree, scalar_results, j % group_size);
3555                   new_stmt = gimple_build_assign_with_ops (code,
3556                                               new_scalar_dest, first_res, res);
3557                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
3558                   gimple_assign_set_lhs (new_stmt, new_res);
3559                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3560                   VEC_replace (tree, scalar_results, j % group_size, new_res);
3561                 }
3562             }
3563           else
3564             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
3565             VEC_safe_push (tree, heap, scalar_results, new_temp);
3566
3567           extract_scalar_result = false;
3568         }
3569     }
3570
3571   /* 2.4  Extract the final scalar result.  Create:
3572           s_out3 = extract_field <v_out2, bitpos>  */
3573
3574   if (extract_scalar_result)
3575     {
3576       tree rhs;
3577
3578       if (vect_print_dump_info (REPORT_DETAILS))
3579         fprintf (vect_dump, "extract scalar result");
3580
3581       if (BYTES_BIG_ENDIAN)
3582         bitpos = size_binop (MULT_EXPR,
3583                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3584                              TYPE_SIZE (scalar_type));
3585       else
3586         bitpos = bitsize_zero_node;
3587
3588       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3589       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3590       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3591       gimple_assign_set_lhs (epilog_stmt, new_temp);
3592       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3593       VEC_safe_push (tree, heap, scalar_results, new_temp);
3594     }
3595
3596 vect_finalize_reduction:
3597
3598   if (double_reduc)
3599     loop = loop->inner;
3600
3601   /* 2.5 Adjust the final result by the initial value of the reduction
3602          variable. (When such adjustment is not needed, then
3603          'adjustment_def' is zero).  For example, if code is PLUS we create:
3604          new_temp = loop_exit_def + adjustment_def  */
3605
3606   if (adjustment_def)
3607     {
3608       gcc_assert (!slp_node);
3609       if (nested_in_vect_loop)
3610         {
3611           new_phi = VEC_index (gimple, new_phis, 0);
3612           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3613           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3614           new_dest = vect_create_destination_var (scalar_dest, vectype);
3615         }
3616       else
3617         {
3618           new_temp = VEC_index (tree, scalar_results, 0);
3619           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3620           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3621           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3622         }
3623
3624       epilog_stmt = gimple_build_assign (new_dest, expr);
3625       new_temp = make_ssa_name (new_dest, epilog_stmt);
3626       gimple_assign_set_lhs (epilog_stmt, new_temp);
3627       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3628       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3629       if (nested_in_vect_loop)
3630         {
3631           set_vinfo_for_stmt (epilog_stmt,
3632                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3633                                                  NULL));
3634           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3635                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3636
3637           if (!double_reduc)
3638             VEC_quick_push (tree, scalar_results, new_temp);
3639           else
3640             VEC_replace (tree, scalar_results, 0, new_temp);
3641         }
3642       else
3643         VEC_replace (tree, scalar_results, 0, new_temp);
3644
3645       VEC_replace (gimple, new_phis, 0, epilog_stmt);
3646     }
3647
3648   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
3649           phis with new adjusted scalar results, i.e., replace use <s_out0>
3650           with use <s_out4>.
3651
3652      Transform:
3653         loop_exit:
3654           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3655           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3656           v_out2 = reduce <v_out1>
3657           s_out3 = extract_field <v_out2, 0>
3658           s_out4 = adjust_result <s_out3>
3659           use <s_out0>
3660           use <s_out0>
3661
3662      into:
3663
3664         loop_exit:
3665           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3666           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3667           v_out2 = reduce <v_out1>
3668           s_out3 = extract_field <v_out2, 0>
3669           s_out4 = adjust_result <s_out3>
3670           use <s_out4>
3671           use <s_out4> */
3672
3673   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3674      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
3675      need to match SCALAR_RESULTS with corresponding statements.  The first
3676      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3677      the first vector stmt, etc.
3678      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
3679   if (group_size > VEC_length (gimple, new_phis))
3680     {
3681       ratio = group_size / VEC_length (gimple, new_phis);
3682       gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3683     }
3684   else
3685     ratio = 1;
3686
3687   for (k = 0; k < group_size; k++)
3688     {
3689       if (k % ratio == 0)
3690         {
3691           epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3692           reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3693         }
3694
3695       if (slp_node)
3696         {
3697           gimple current_stmt = VEC_index (gimple,
3698                                        SLP_TREE_SCALAR_STMTS (slp_node), k);
3699
3700           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3701           /* SLP statements can't participate in patterns.  */
3702           gcc_assert (!orig_stmt);
3703           scalar_dest = gimple_assign_lhs (current_stmt);
3704         }
3705
3706       phis = VEC_alloc (gimple, heap, 3);
3707       /* Find the loop-closed-use at the loop exit of the original scalar
3708          result.  (The reduction result is expected to have two immediate uses -
3709          one at the latch block, and one at the loop exit).  */
3710       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3711         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3712           VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3713
3714       /* We expect to have found an exit_phi because of loop-closed-ssa
3715          form.  */
3716       gcc_assert (!VEC_empty (gimple, phis));
3717
3718       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3719         {
3720           if (outer_loop)
3721             {
3722               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3723               gimple vect_phi;
3724
3725               /* FORNOW. Currently not supporting the case that an inner-loop
3726                  reduction is not used in the outer-loop (but only outside the
3727                  outer-loop), unless it is double reduction.  */
3728               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3729                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3730                           || double_reduc);
3731
3732               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3733               if (!double_reduc
3734                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3735                       != vect_double_reduction_def)
3736                 continue;
3737
3738               /* Handle double reduction:
3739
3740                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3741                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3742                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
3743                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3744
3745                  At that point the regular reduction (stmt2 and stmt3) is
3746                  already vectorized, as well as the exit phi node, stmt4.
3747                  Here we vectorize the phi node of double reduction, stmt1, and
3748                  update all relevant statements.  */
3749
3750               /* Go through all the uses of s2 to find double reduction phi
3751                  node, i.e., stmt1 above.  */
3752               orig_name = PHI_RESULT (exit_phi);
3753               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3754                 {
3755                   stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3756                   stmt_vec_info new_phi_vinfo;
3757                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3758                   basic_block bb = gimple_bb (use_stmt);
3759                   gimple use;
3760
3761                   /* Check that USE_STMT is really double reduction phi
3762                      node.  */
3763                   if (gimple_code (use_stmt) != GIMPLE_PHI
3764                       || gimple_phi_num_args (use_stmt) != 2
3765                       || !use_stmt_vinfo
3766                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3767                           != vect_double_reduction_def
3768                       || bb->loop_father != outer_loop)
3769                     continue;
3770
3771                   /* Create vector phi node for double reduction:
3772                      vs1 = phi <vs0, vs2>
3773                      vs1 was created previously in this function by a call to
3774                        vect_get_vec_def_for_operand and is stored in
3775                        vec_initial_def;
3776                      vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3777                      vs0 is created here.  */
3778
3779                   /* Create vector phi node.  */
3780                   vect_phi = create_phi_node (vec_initial_def, bb);
3781                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
3782                                     loop_vec_info_for_loop (outer_loop), NULL);
3783                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3784
3785                   /* Create vs0 - initial def of the double reduction phi.  */
3786                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3787                                              loop_preheader_edge (outer_loop));
3788                   init_def = get_initial_def_for_reduction (stmt,
3789                                                           preheader_arg, NULL);
3790                   vect_phi_init = vect_init_vector (use_stmt, init_def,
3791                                                     vectype, NULL);
3792
3793                   /* Update phi node arguments with vs0 and vs2.  */
3794                   add_phi_arg (vect_phi, vect_phi_init,
3795                                loop_preheader_edge (outer_loop),
3796                                UNKNOWN_LOCATION);
3797                   add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3798                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3799                   if (vect_print_dump_info (REPORT_DETAILS))
3800                     {
3801                       fprintf (vect_dump, "created double reduction phi "
3802                                           "node: ");
3803                       print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3804                     }
3805
3806                   vect_phi_res = PHI_RESULT (vect_phi);
3807
3808                   /* Replace the use, i.e., set the correct vs1 in the regular
3809                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
3810                      loop is redundant.  */
3811                   use = reduction_phi;
3812                   for (j = 0; j < ncopies; j++)
3813                     {
3814                       edge pr_edge = loop_preheader_edge (loop);
3815                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3816                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3817                     }
3818                 }
3819             }
3820         }
3821
3822       VEC_free (gimple, heap, phis);
3823       if (nested_in_vect_loop)
3824         {
3825           if (double_reduc)
3826             loop = outer_loop;
3827           else
3828             continue;
3829         }
3830
3831       phis = VEC_alloc (gimple, heap, 3);
3832       /* Find the loop-closed-use at the loop exit of the original scalar
3833          result.  (The reduction result is expected to have two immediate uses,
3834          one at the latch block, and one at the loop exit).  For double
3835          reductions we are looking for exit phis of the outer loop.  */
3836       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3837         {
3838           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3839             VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3840           else
3841             {
3842               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
3843                 {
3844                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
3845
3846                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
3847                     {
3848                       if (!flow_bb_inside_loop_p (loop,
3849                                              gimple_bb (USE_STMT (phi_use_p))))
3850                         VEC_safe_push (gimple, heap, phis,
3851                                        USE_STMT (phi_use_p));
3852                     }
3853                 }
3854             }
3855         }
3856
3857       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
3858         {
3859           /* Replace the uses:  */
3860           orig_name = PHI_RESULT (exit_phi);
3861           scalar_result = VEC_index (tree, scalar_results, k);
3862           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3863             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3864               SET_USE (use_p, scalar_result);
3865         }
3866
3867       VEC_free (gimple, heap, phis);
3868     }
3869
3870   VEC_free (tree, heap, scalar_results);
3871   VEC_free (gimple, heap, new_phis);
3872 }
3873
3874
3875 /* Function vectorizable_reduction.
3876
3877    Check if STMT performs a reduction operation that can be vectorized.
3878    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3879    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3880    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3881
3882    This function also handles reduction idioms (patterns) that have been
3883    recognized in advance during vect_pattern_recog.  In this case, STMT may be
3884    of this form:
3885      X = pattern_expr (arg0, arg1, ..., X)
3886    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3887    sequence that had been detected and replaced by the pattern-stmt (STMT).
3888
3889    In some cases of reduction patterns, the type of the reduction variable X is
3890    different than the type of the other arguments of STMT.
3891    In such cases, the vectype that is used when transforming STMT into a vector
3892    stmt is different than the vectype that is used to determine the
3893    vectorization factor, because it consists of a different number of elements
3894    than the actual number of elements that are being operated upon in parallel.
3895
3896    For example, consider an accumulation of shorts into an int accumulator.
3897    On some targets it's possible to vectorize this pattern operating on 8
3898    shorts at a time (hence, the vectype for purposes of determining the
3899    vectorization factor should be V8HI); on the other hand, the vectype that
3900    is used to create the vector form is actually V4SI (the type of the result).
3901
3902    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3903    indicates what is the actual level of parallelism (V8HI in the example), so
3904    that the right vectorization factor would be derived.  This vectype
3905    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3906    be used to create the vectorized stmt.  The right vectype for the vectorized
3907    stmt is obtained from the type of the result X:
3908         get_vectype_for_scalar_type (TREE_TYPE (X))
3909
3910    This means that, contrary to "regular" reductions (or "regular" stmts in
3911    general), the following equation:
3912       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3913    does *NOT* necessarily hold for reduction patterns.  */
3914
3915 bool
3916 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3917                         gimple *vec_stmt, slp_tree slp_node)
3918 {
3919   tree vec_dest;
3920   tree scalar_dest;
3921   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3922   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3923   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3924   tree vectype_in = NULL_TREE;
3925   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3926   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3927   enum tree_code code, orig_code, epilog_reduc_code;
3928   enum machine_mode vec_mode;
3929   int op_type;
3930   optab optab, reduc_optab;
3931   tree new_temp = NULL_TREE;
3932   tree def;
3933   gimple def_stmt;
3934   enum vect_def_type dt;
3935   gimple new_phi = NULL;
3936   tree scalar_type;
3937   bool is_simple_use;
3938   gimple orig_stmt;
3939   stmt_vec_info orig_stmt_info;
3940   tree expr = NULL_TREE;
3941   int i;
3942   int ncopies;
3943   int epilog_copies;
3944   stmt_vec_info prev_stmt_info, prev_phi_info;
3945   bool single_defuse_cycle = false;
3946   tree reduc_def = NULL_TREE;
3947   gimple new_stmt = NULL;
3948   int j;
3949   tree ops[3];
3950   bool nested_cycle = false, found_nested_cycle_def = false;
3951   gimple reduc_def_stmt = NULL;
3952   /* The default is that the reduction variable is the last in statement.  */
3953   int reduc_index = 2;
3954   bool double_reduc = false, dummy;
3955   basic_block def_bb;
3956   struct loop * def_stmt_loop, *outer_loop = NULL;
3957   tree def_arg;
3958   gimple def_arg_stmt;
3959   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3960   VEC (gimple, heap) *phis = NULL;
3961   int vec_num;
3962   tree def0, def1, tem;
3963
3964   if (nested_in_vect_loop_p (loop, stmt))
3965     {
3966       outer_loop = loop;
3967       loop = loop->inner;
3968       nested_cycle = true;
3969     }
3970
3971   /* 1. Is vectorizable reduction?  */
3972   /* Not supportable if the reduction variable is used in the loop.  */
3973   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3974     return false;
3975
3976   /* Reductions that are not used even in an enclosing outer-loop,
3977      are expected to be "live" (used out of the loop).  */
3978   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3979       && !STMT_VINFO_LIVE_P (stmt_info))
3980     return false;
3981
3982   /* Make sure it was already recognized as a reduction computation.  */
3983   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3984       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3985     return false;
3986
3987   /* 2. Has this been recognized as a reduction pattern?
3988
3989      Check if STMT represents a pattern that has been recognized
3990      in earlier analysis stages.  For stmts that represent a pattern,
3991      the STMT_VINFO_RELATED_STMT field records the last stmt in
3992      the original sequence that constitutes the pattern.  */
3993
3994   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3995   if (orig_stmt)
3996     {
3997       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3998       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3999       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4000       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4001     }
4002
4003   /* 3. Check the operands of the operation.  The first operands are defined
4004         inside the loop body. The last operand is the reduction variable,
4005         which is defined by the loop-header-phi.  */
4006
4007   gcc_assert (is_gimple_assign (stmt));
4008
4009   /* Flatten RHS.  */
4010   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4011     {
4012     case GIMPLE_SINGLE_RHS:
4013       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4014       if (op_type == ternary_op)
4015         {
4016           tree rhs = gimple_assign_rhs1 (stmt);
4017           ops[0] = TREE_OPERAND (rhs, 0);
4018           ops[1] = TREE_OPERAND (rhs, 1);
4019           ops[2] = TREE_OPERAND (rhs, 2);
4020           code = TREE_CODE (rhs);
4021         }
4022       else
4023         return false;
4024       break;
4025
4026     case GIMPLE_BINARY_RHS:
4027       code = gimple_assign_rhs_code (stmt);
4028       op_type = TREE_CODE_LENGTH (code);
4029       gcc_assert (op_type == binary_op);
4030       ops[0] = gimple_assign_rhs1 (stmt);
4031       ops[1] = gimple_assign_rhs2 (stmt);
4032       break;
4033
4034     case GIMPLE_UNARY_RHS:
4035       return false;
4036
4037     default:
4038       gcc_unreachable ();
4039     }
4040
4041   scalar_dest = gimple_assign_lhs (stmt);
4042   scalar_type = TREE_TYPE (scalar_dest);
4043   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4044       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4045     return false;
4046
4047   /* All uses but the last are expected to be defined in the loop.
4048      The last use is the reduction variable.  In case of nested cycle this
4049      assumption is not true: we use reduc_index to record the index of the
4050      reduction variable.  */
4051   for (i = 0; i < op_type-1; i++)
4052     {
4053       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4054       if (i == 0 && code == COND_EXPR)
4055         continue;
4056
4057       is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4058                                             &def_stmt, &def, &dt, &tem);
4059       if (!vectype_in)
4060         vectype_in = tem;
4061       gcc_assert (is_simple_use);
4062       if (dt != vect_internal_def
4063           && dt != vect_external_def
4064           && dt != vect_constant_def
4065           && dt != vect_induction_def
4066           && !(dt == vect_nested_cycle && nested_cycle))
4067         return false;
4068
4069       if (dt == vect_nested_cycle)
4070         {
4071           found_nested_cycle_def = true;
4072           reduc_def_stmt = def_stmt;
4073           reduc_index = i;
4074         }
4075     }
4076
4077   is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4078                                         &def, &dt, &tem);
4079   if (!vectype_in)
4080     vectype_in = tem;
4081   gcc_assert (is_simple_use);
4082   gcc_assert (dt == vect_reduction_def
4083               || dt == vect_nested_cycle
4084               || ((dt == vect_internal_def || dt == vect_external_def
4085                    || dt == vect_constant_def || dt == vect_induction_def)
4086                    && nested_cycle && found_nested_cycle_def));
4087   if (!found_nested_cycle_def)
4088     reduc_def_stmt = def_stmt;
4089
4090   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4091   if (orig_stmt)
4092     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4093                                                        reduc_def_stmt,
4094                                                        !nested_cycle,
4095                                                        &dummy));
4096   else
4097     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4098                                                   !nested_cycle, &dummy));
4099
4100   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4101     return false;
4102
4103   if (slp_node)
4104     ncopies = 1;
4105   else
4106     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4107                / TYPE_VECTOR_SUBPARTS (vectype_in));
4108
4109   gcc_assert (ncopies >= 1);
4110
4111   vec_mode = TYPE_MODE (vectype_in);
4112
4113   if (code == COND_EXPR)
4114     {
4115       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4116         {
4117           if (vect_print_dump_info (REPORT_DETAILS))
4118             fprintf (vect_dump, "unsupported condition in reduction");
4119
4120             return false;
4121         }
4122     }
4123   else
4124     {
4125       /* 4. Supportable by target?  */
4126
4127       /* 4.1. check support for the operation in the loop  */
4128       optab = optab_for_tree_code (code, vectype_in, optab_default);
4129       if (!optab)
4130         {
4131           if (vect_print_dump_info (REPORT_DETAILS))
4132             fprintf (vect_dump, "no optab.");
4133
4134           return false;
4135         }
4136
4137       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4138         {
4139           if (vect_print_dump_info (REPORT_DETAILS))
4140             fprintf (vect_dump, "op not supported by target.");
4141
4142           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4143               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4144                   < vect_min_worthwhile_factor (code))
4145             return false;
4146
4147           if (vect_print_dump_info (REPORT_DETAILS))
4148             fprintf (vect_dump, "proceeding using word mode.");
4149         }
4150
4151       /* Worthwhile without SIMD support?  */
4152       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4153           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4154              < vect_min_worthwhile_factor (code))
4155         {
4156           if (vect_print_dump_info (REPORT_DETAILS))
4157             fprintf (vect_dump, "not worthwhile without SIMD support.");
4158
4159           return false;
4160         }
4161     }
4162
4163   /* 4.2. Check support for the epilog operation.
4164
4165           If STMT represents a reduction pattern, then the type of the
4166           reduction variable may be different than the type of the rest
4167           of the arguments.  For example, consider the case of accumulation
4168           of shorts into an int accumulator; The original code:
4169                         S1: int_a = (int) short_a;
4170           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4171
4172           was replaced with:
4173                         STMT: int_acc = widen_sum <short_a, int_acc>
4174
4175           This means that:
4176           1. The tree-code that is used to create the vector operation in the
4177              epilog code (that reduces the partial results) is not the
4178              tree-code of STMT, but is rather the tree-code of the original
4179              stmt from the pattern that STMT is replacing.  I.e, in the example
4180              above we want to use 'widen_sum' in the loop, but 'plus' in the
4181              epilog.
4182           2. The type (mode) we use to check available target support
4183              for the vector operation to be created in the *epilog*, is
4184              determined by the type of the reduction variable (in the example
4185              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4186              However the type (mode) we use to check available target support
4187              for the vector operation to be created *inside the loop*, is
4188              determined by the type of the other arguments to STMT (in the
4189              example we'd check this: optab_handler (widen_sum_optab,
4190              vect_short_mode)).
4191
4192           This is contrary to "regular" reductions, in which the types of all
4193           the arguments are the same as the type of the reduction variable.
4194           For "regular" reductions we can therefore use the same vector type
4195           (and also the same tree-code) when generating the epilog code and
4196           when generating the code inside the loop.  */
4197
4198   if (orig_stmt)
4199     {
4200       /* This is a reduction pattern: get the vectype from the type of the
4201          reduction variable, and get the tree-code from orig_stmt.  */
4202       orig_code = gimple_assign_rhs_code (orig_stmt);
4203       gcc_assert (vectype_out);
4204       vec_mode = TYPE_MODE (vectype_out);
4205     }
4206   else
4207     {
4208       /* Regular reduction: use the same vectype and tree-code as used for
4209          the vector code inside the loop can be used for the epilog code. */
4210       orig_code = code;
4211     }
4212
4213   if (nested_cycle)
4214     {
4215       def_bb = gimple_bb (reduc_def_stmt);
4216       def_stmt_loop = def_bb->loop_father;
4217       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4218                                        loop_preheader_edge (def_stmt_loop));
4219       if (TREE_CODE (def_arg) == SSA_NAME
4220           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4221           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4222           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4223           && vinfo_for_stmt (def_arg_stmt)
4224           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4225               == vect_double_reduction_def)
4226         double_reduc = true;
4227     }
4228
4229   epilog_reduc_code = ERROR_MARK;
4230   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4231     {
4232       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4233                                          optab_default);
4234       if (!reduc_optab)
4235         {
4236           if (vect_print_dump_info (REPORT_DETAILS))
4237             fprintf (vect_dump, "no optab for reduction.");
4238
4239           epilog_reduc_code = ERROR_MARK;
4240         }
4241
4242       if (reduc_optab
4243           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4244         {
4245           if (vect_print_dump_info (REPORT_DETAILS))
4246             fprintf (vect_dump, "reduc op not supported by target.");
4247
4248           epilog_reduc_code = ERROR_MARK;
4249         }
4250     }
4251   else
4252     {
4253       if (!nested_cycle || double_reduc)
4254         {
4255           if (vect_print_dump_info (REPORT_DETAILS))
4256             fprintf (vect_dump, "no reduc code for scalar code.");
4257
4258           return false;
4259         }
4260     }
4261
4262   if (double_reduc && ncopies > 1)
4263     {
4264       if (vect_print_dump_info (REPORT_DETAILS))
4265         fprintf (vect_dump, "multiple types in double reduction");
4266
4267       return false;
4268     }
4269
4270   if (!vec_stmt) /* transformation not required.  */
4271     {
4272       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4273       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4274         return false;
4275       return true;
4276     }
4277
4278   /** Transform.  **/
4279
4280   if (vect_print_dump_info (REPORT_DETAILS))
4281     fprintf (vect_dump, "transform reduction.");
4282
4283   /* FORNOW: Multiple types are not supported for condition.  */
4284   if (code == COND_EXPR)
4285     gcc_assert (ncopies == 1);
4286
4287   /* Create the destination vector  */
4288   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4289
4290   /* In case the vectorization factor (VF) is bigger than the number
4291      of elements that we can fit in a vectype (nunits), we have to generate
4292      more than one vector stmt - i.e - we need to "unroll" the
4293      vector stmt by a factor VF/nunits.  For more details see documentation
4294      in vectorizable_operation.  */
4295
4296   /* If the reduction is used in an outer loop we need to generate
4297      VF intermediate results, like so (e.g. for ncopies=2):
4298         r0 = phi (init, r0)
4299         r1 = phi (init, r1)
4300         r0 = x0 + r0;
4301         r1 = x1 + r1;
4302     (i.e. we generate VF results in 2 registers).
4303     In this case we have a separate def-use cycle for each copy, and therefore
4304     for each copy we get the vector def for the reduction variable from the
4305     respective phi node created for this copy.
4306
4307     Otherwise (the reduction is unused in the loop nest), we can combine
4308     together intermediate results, like so (e.g. for ncopies=2):
4309         r = phi (init, r)
4310         r = x0 + r;
4311         r = x1 + r;
4312    (i.e. we generate VF/2 results in a single register).
4313    In this case for each copy we get the vector def for the reduction variable
4314    from the vectorized reduction operation generated in the previous iteration.
4315   */
4316
4317   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4318     {
4319       single_defuse_cycle = true;
4320       epilog_copies = 1;
4321     }
4322   else
4323     epilog_copies = ncopies;
4324
4325   prev_stmt_info = NULL;
4326   prev_phi_info = NULL;
4327   if (slp_node)
4328     {
4329       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4330       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4331                   == TYPE_VECTOR_SUBPARTS (vectype_in));
4332     }
4333   else
4334     {
4335       vec_num = 1;
4336       vec_oprnds0 = VEC_alloc (tree, heap, 1);
4337       if (op_type == ternary_op)
4338         vec_oprnds1 = VEC_alloc (tree, heap, 1);
4339     }
4340
4341   phis = VEC_alloc (gimple, heap, vec_num);
4342   vect_defs = VEC_alloc (tree, heap, vec_num);
4343   if (!slp_node)
4344     VEC_quick_push (tree, vect_defs, NULL_TREE);
4345
4346   for (j = 0; j < ncopies; j++)
4347     {
4348       if (j == 0 || !single_defuse_cycle)
4349         {
4350           for (i = 0; i < vec_num; i++)
4351             {
4352               /* Create the reduction-phi that defines the reduction
4353                  operand.  */
4354               new_phi = create_phi_node (vec_dest, loop->header);
4355               set_vinfo_for_stmt (new_phi,
4356                                   new_stmt_vec_info (new_phi, loop_vinfo,
4357                                                      NULL));
4358                if (j == 0 || slp_node)
4359                  VEC_quick_push (gimple, phis, new_phi);
4360             }
4361         }
4362
4363       if (code == COND_EXPR)
4364         {
4365           gcc_assert (!slp_node);
4366           vectorizable_condition (stmt, gsi, vec_stmt,
4367                                   PHI_RESULT (VEC_index (gimple, phis, 0)),
4368                                   reduc_index);
4369           /* Multiple types are not supported for condition.  */
4370           break;
4371         }
4372
4373       /* Handle uses.  */
4374       if (j == 0)
4375         {
4376           tree op0, op1 = NULL_TREE;
4377
4378           op0 = ops[!reduc_index];
4379           if (op_type == ternary_op)
4380             {
4381               if (reduc_index == 0)
4382                 op1 = ops[2];
4383               else
4384                 op1 = ops[1];
4385             }
4386
4387           if (slp_node)
4388             vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4389                                -1);
4390           else
4391             {
4392               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4393                                                             stmt, NULL);
4394               VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4395               if (op_type == ternary_op)
4396                {
4397                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4398                                                                NULL);
4399                  VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4400                }
4401             }
4402         }
4403       else
4404         {
4405           if (!slp_node)
4406             {
4407               enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4408               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4409               VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4410               if (op_type == ternary_op)
4411                 {
4412                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4413                                                                 loop_vec_def1);
4414                   VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4415                 }
4416             }
4417
4418           if (single_defuse_cycle)
4419             reduc_def = gimple_assign_lhs (new_stmt);
4420
4421           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4422         }
4423
4424       FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4425         {
4426           if (slp_node)
4427             reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4428           else
4429             {
4430               if (!single_defuse_cycle || j == 0)
4431                 reduc_def = PHI_RESULT (new_phi);
4432             }
4433
4434           def1 = ((op_type == ternary_op)
4435                   ? VEC_index (tree, vec_oprnds1, i) : NULL);
4436           if (op_type == binary_op)
4437             {
4438               if (reduc_index == 0)
4439                 expr = build2 (code, vectype_out, reduc_def, def0);
4440               else
4441                 expr = build2 (code, vectype_out, def0, reduc_def);
4442             }
4443           else
4444             {
4445               if (reduc_index == 0)
4446                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4447               else
4448                 {
4449                   if (reduc_index == 1)
4450                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
4451                   else
4452                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
4453                 }
4454             }
4455
4456           new_stmt = gimple_build_assign (vec_dest, expr);
4457           new_temp = make_ssa_name (vec_dest, new_stmt);
4458           gimple_assign_set_lhs (new_stmt, new_temp);
4459           vect_finish_stmt_generation (stmt, new_stmt, gsi);
4460           if (slp_node)
4461             {
4462               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4463               VEC_quick_push (tree, vect_defs, new_temp);
4464             }
4465           else
4466             VEC_replace (tree, vect_defs, 0, new_temp);
4467         }
4468
4469       if (slp_node)
4470         continue;
4471
4472       if (j == 0)
4473         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4474       else
4475         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4476
4477       prev_stmt_info = vinfo_for_stmt (new_stmt);
4478       prev_phi_info = vinfo_for_stmt (new_phi);
4479     }
4480
4481   /* Finalize the reduction-phi (set its arguments) and create the
4482      epilog reduction code.  */
4483   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4484     {
4485       new_temp = gimple_assign_lhs (*vec_stmt);
4486       VEC_replace (tree, vect_defs, 0, new_temp);
4487     }
4488
4489   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4490                                     epilog_reduc_code, phis, reduc_index,
4491                                     double_reduc, slp_node);
4492
4493   VEC_free (gimple, heap, phis);
4494   VEC_free (tree, heap, vec_oprnds0);
4495   if (vec_oprnds1)
4496     VEC_free (tree, heap, vec_oprnds1);
4497
4498   return true;
4499 }
4500
4501 /* Function vect_min_worthwhile_factor.
4502
4503    For a loop where we could vectorize the operation indicated by CODE,
4504    return the minimum vectorization factor that makes it worthwhile
4505    to use generic vectors.  */
4506 int
4507 vect_min_worthwhile_factor (enum tree_code code)
4508 {
4509   switch (code)
4510     {
4511     case PLUS_EXPR:
4512     case MINUS_EXPR:
4513     case NEGATE_EXPR:
4514       return 4;
4515
4516     case BIT_AND_EXPR:
4517     case BIT_IOR_EXPR:
4518     case BIT_XOR_EXPR:
4519     case BIT_NOT_EXPR:
4520       return 2;
4521
4522     default:
4523       return INT_MAX;
4524     }
4525 }
4526
4527
4528 /* Function vectorizable_induction
4529
4530    Check if PHI performs an induction computation that can be vectorized.
4531    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4532    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4533    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4534
4535 bool
4536 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4537                         gimple *vec_stmt)
4538 {
4539   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4540   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4541   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4542   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4543   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4544   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4545   tree vec_def;
4546
4547   gcc_assert (ncopies >= 1);
4548   /* FORNOW. This restriction should be relaxed.  */
4549   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4550     {
4551       if (vect_print_dump_info (REPORT_DETAILS))
4552         fprintf (vect_dump, "multiple types in nested loop.");
4553       return false;
4554     }
4555
4556   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4557     return false;
4558
4559   /* FORNOW: SLP not supported.  */
4560   if (STMT_SLP_TYPE (stmt_info))
4561     return false;
4562
4563   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4564
4565   if (gimple_code (phi) != GIMPLE_PHI)
4566     return false;
4567
4568   if (!vec_stmt) /* transformation not required.  */
4569     {
4570       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4571       if (vect_print_dump_info (REPORT_DETAILS))
4572         fprintf (vect_dump, "=== vectorizable_induction ===");
4573       vect_model_induction_cost (stmt_info, ncopies);
4574       return true;
4575     }
4576
4577   /** Transform.  **/
4578
4579   if (vect_print_dump_info (REPORT_DETAILS))
4580     fprintf (vect_dump, "transform induction phi.");
4581
4582   vec_def = get_initial_def_for_induction (phi);
4583   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4584   return true;
4585 }
4586
4587 /* Function vectorizable_live_operation.
4588
4589    STMT computes a value that is used outside the loop.  Check if
4590    it can be supported.  */
4591
4592 bool
4593 vectorizable_live_operation (gimple stmt,
4594                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4595                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4596 {
4597   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4598   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4599   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4600   int i;
4601   int op_type;
4602   tree op;
4603   tree def;
4604   gimple def_stmt;
4605   enum vect_def_type dt;
4606   enum tree_code code;
4607   enum gimple_rhs_class rhs_class;
4608
4609   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4610
4611   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4612     return false;
4613
4614   if (!is_gimple_assign (stmt))
4615     return false;
4616
4617   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4618     return false;
4619
4620   /* FORNOW. CHECKME. */
4621   if (nested_in_vect_loop_p (loop, stmt))
4622     return false;
4623
4624   code = gimple_assign_rhs_code (stmt);
4625   op_type = TREE_CODE_LENGTH (code);
4626   rhs_class = get_gimple_rhs_class (code);
4627   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4628   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4629
4630   /* FORNOW: support only if all uses are invariant.  This means
4631      that the scalar operations can remain in place, unvectorized.
4632      The original last scalar value that they compute will be used.  */
4633
4634   for (i = 0; i < op_type; i++)
4635     {
4636       if (rhs_class == GIMPLE_SINGLE_RHS)
4637         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4638       else
4639         op = gimple_op (stmt, i + 1);
4640       if (op
4641           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4642         {
4643           if (vect_print_dump_info (REPORT_DETAILS))
4644             fprintf (vect_dump, "use not simple.");
4645           return false;
4646         }
4647
4648       if (dt != vect_external_def && dt != vect_constant_def)
4649         return false;
4650     }
4651
4652   /* No transformation is required for the cases we currently support.  */
4653   return true;
4654 }
4655
4656 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
4657
4658 static void
4659 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4660 {
4661   ssa_op_iter op_iter;
4662   imm_use_iterator imm_iter;
4663   def_operand_p def_p;
4664   gimple ustmt;
4665
4666   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4667     {
4668       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4669         {
4670           basic_block bb;
4671
4672           if (!is_gimple_debug (ustmt))
4673             continue;
4674
4675           bb = gimple_bb (ustmt);
4676
4677           if (!flow_bb_inside_loop_p (loop, bb))
4678             {
4679               if (gimple_debug_bind_p (ustmt))
4680                 {
4681                   if (vect_print_dump_info (REPORT_DETAILS))
4682                     fprintf (vect_dump, "killing debug use");
4683
4684                   gimple_debug_bind_reset_value (ustmt);
4685                   update_stmt (ustmt);
4686                 }
4687               else
4688                 gcc_unreachable ();
4689             }
4690         }
4691     }
4692 }
4693
4694 /* Function vect_transform_loop.
4695
4696    The analysis phase has determined that the loop is vectorizable.
4697    Vectorize the loop - created vectorized stmts to replace the scalar
4698    stmts in the loop, and update the loop exit condition.  */
4699
4700 void
4701 vect_transform_loop (loop_vec_info loop_vinfo)
4702 {
4703   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4704   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4705   int nbbs = loop->num_nodes;
4706   gimple_stmt_iterator si;
4707   int i;
4708   tree ratio = NULL;
4709   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4710   bool strided_store;
4711   bool slp_scheduled = false;
4712   unsigned int nunits;
4713   tree cond_expr = NULL_TREE;
4714   gimple_seq cond_expr_stmt_list = NULL;
4715   bool do_peeling_for_loop_bound;
4716
4717   if (vect_print_dump_info (REPORT_DETAILS))
4718     fprintf (vect_dump, "=== vec_transform_loop ===");
4719
4720   /* Peel the loop if there are data refs with unknown alignment.
4721      Only one data ref with unknown store is allowed.  */
4722
4723   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4724     vect_do_peeling_for_alignment (loop_vinfo);
4725
4726   do_peeling_for_loop_bound
4727     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4728        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4729            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
4730        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
4731
4732   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4733       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4734     vect_loop_versioning (loop_vinfo,
4735                           !do_peeling_for_loop_bound,
4736                           &cond_expr, &cond_expr_stmt_list);
4737
4738   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4739      compile time constant), or it is a constant that doesn't divide by the
4740      vectorization factor, then an epilog loop needs to be created.
4741      We therefore duplicate the loop: the original loop will be vectorized,
4742      and will compute the first (n/VF) iterations.  The second copy of the loop
4743      will remain scalar and will compute the remaining (n%VF) iterations.
4744      (VF is the vectorization factor).  */
4745
4746   if (do_peeling_for_loop_bound)
4747     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4748                                     cond_expr, cond_expr_stmt_list);
4749   else
4750     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4751                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4752
4753   /* 1) Make sure the loop header has exactly two entries
4754      2) Make sure we have a preheader basic block.  */
4755
4756   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4757
4758   split_edge (loop_preheader_edge (loop));
4759
4760   /* FORNOW: the vectorizer supports only loops which body consist
4761      of one basic block (header + empty latch). When the vectorizer will
4762      support more involved loop forms, the order by which the BBs are
4763      traversed need to be reconsidered.  */
4764
4765   for (i = 0; i < nbbs; i++)
4766     {
4767       basic_block bb = bbs[i];
4768       stmt_vec_info stmt_info;
4769       gimple phi;
4770
4771       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4772         {
4773           phi = gsi_stmt (si);
4774           if (vect_print_dump_info (REPORT_DETAILS))
4775             {
4776               fprintf (vect_dump, "------>vectorizing phi: ");
4777               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4778             }
4779           stmt_info = vinfo_for_stmt (phi);
4780           if (!stmt_info)
4781             continue;
4782
4783           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4784             vect_loop_kill_debug_uses (loop, phi);
4785
4786           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4787               && !STMT_VINFO_LIVE_P (stmt_info))
4788             continue;
4789
4790           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4791                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4792               && vect_print_dump_info (REPORT_DETAILS))
4793             fprintf (vect_dump, "multiple-types.");
4794
4795           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4796             {
4797               if (vect_print_dump_info (REPORT_DETAILS))
4798                 fprintf (vect_dump, "transform phi.");
4799               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4800             }
4801         }
4802
4803       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4804         {
4805           gimple stmt = gsi_stmt (si);
4806           bool is_store;
4807
4808           if (vect_print_dump_info (REPORT_DETAILS))
4809             {
4810               fprintf (vect_dump, "------>vectorizing statement: ");
4811               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4812             }
4813
4814           stmt_info = vinfo_for_stmt (stmt);
4815
4816           /* vector stmts created in the outer-loop during vectorization of
4817              stmts in an inner-loop may not have a stmt_info, and do not
4818              need to be vectorized.  */
4819           if (!stmt_info)
4820             {
4821               gsi_next (&si);
4822               continue;
4823             }
4824
4825           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4826             vect_loop_kill_debug_uses (loop, stmt);
4827
4828           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4829               && !STMT_VINFO_LIVE_P (stmt_info))
4830             {
4831               gsi_next (&si);
4832               continue;
4833             }
4834
4835           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4836           nunits =
4837             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4838           if (!STMT_SLP_TYPE (stmt_info)
4839               && nunits != (unsigned int) vectorization_factor
4840               && vect_print_dump_info (REPORT_DETAILS))
4841             /* For SLP VF is set according to unrolling factor, and not to
4842                vector size, hence for SLP this print is not valid.  */
4843             fprintf (vect_dump, "multiple-types.");
4844
4845           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4846              reached.  */
4847           if (STMT_SLP_TYPE (stmt_info))
4848             {
4849               if (!slp_scheduled)
4850                 {
4851                   slp_scheduled = true;
4852
4853                   if (vect_print_dump_info (REPORT_DETAILS))
4854                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4855
4856                   vect_schedule_slp (loop_vinfo, NULL);
4857                 }
4858
4859               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4860               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4861                 {
4862                   gsi_next (&si);
4863                   continue;
4864                 }
4865             }
4866
4867           /* -------- vectorize statement ------------ */
4868           if (vect_print_dump_info (REPORT_DETAILS))
4869             fprintf (vect_dump, "transform statement.");
4870
4871           strided_store = false;
4872           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4873           if (is_store)
4874             {
4875               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4876                 {
4877                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4878                      interleaving chain was completed - free all the stores in
4879                      the chain.  */
4880                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4881                   gsi_remove (&si, true);
4882                   continue;
4883                 }
4884               else
4885                 {
4886                   /* Free the attached stmt_vec_info and remove the stmt.  */
4887                   free_stmt_vec_info (stmt);
4888                   gsi_remove (&si, true);
4889                   continue;
4890                 }
4891             }
4892           gsi_next (&si);
4893         }                       /* stmts in BB */
4894     }                           /* BBs in loop */
4895
4896   slpeel_make_loop_iterate_ntimes (loop, ratio);
4897
4898   /* The memory tags and pointers in vectorized statements need to
4899      have their SSA forms updated.  FIXME, why can't this be delayed
4900      until all the loops have been transformed?  */
4901   update_ssa (TODO_update_ssa);
4902
4903   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4904     fprintf (vect_dump, "LOOP VECTORIZED.");
4905   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4906     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4907 }