gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "tree-pretty-print.h"
  31 #include "gimple-pretty-print.h"
  32 #include "tree-flow.h"
  33 #include "tree-dump.h"
  34 #include "cfgloop.h"
  35 #include "cfglayout.h"
  36 #include "expr.h"
  37 #include "recog.h"
  38 #include "optabs.h"
  39 #include "params.h"
  40 #include "diagnostic-core.h"
  41 #include "tree-chrec.h"
  42 #include "tree-scalar-evolution.h"
  43 #include "tree-vectorizer.h"
  44 #include "target.h"
  45
  46 /* Loop Vectorization Pass.
  47
  48    This pass tries to vectorize loops.
  49
  50    For example, the vectorizer transforms the following simple loop:
  51
  52         short a[N]; short b[N]; short c[N]; int i;
  53
  54         for (i=0; i<N; i++){
  55           a[i] = b[i] + c[i];
  56         }
  57
  58    as if it was manually vectorized by rewriting the source code into:
  59
  60         typedef int __attribute__((mode(V8HI))) v8hi;
  61         short a[N];  short b[N]; short c[N];   int i;
  62         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  63         v8hi va, vb, vc;
  64
  65         for (i=0; i<N/8; i++){
  66           vb = pb[i];
  67           vc = pc[i];
  68           va = vb + vc;
  69           pa[i] = va;
  70         }
  71
  72         The main entry to this pass is vectorize_loops(), in which
  73    the vectorizer applies a set of analyses on a given set of loops,
  74    followed by the actual vectorization transformation for the loops that
  75    had successfully passed the analysis phase.
  76         Throughout this pass we make a distinction between two types of
  77    data: scalars (which are represented by SSA_NAMES), and memory references
  78    ("data-refs").  These two types of data require different handling both
  79    during analysis and transformation. The types of data-refs that the
  80    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  81    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  82    accesses are required to have a simple (consecutive) access pattern.
  83
  84    Analysis phase:
  85    ===============
  86         The driver for the analysis phase is vect_analyze_loop().
  87    It applies a set of analyses, some of which rely on the scalar evolution
  88    analyzer (scev) developed by Sebastian Pop.
  89
  90         During the analysis phase the vectorizer records some information
  91    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  92    loop, as well as general information about the loop as a whole, which is
  93    recorded in a "loop_vec_info" struct attached to each loop.
  94
  95    Transformation phase:
  96    =====================
  97         The loop transformation phase scans all the stmts in the loop, and
  98    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  99    the loop that needs to be vectorized.  It inserts the vector code sequence
 100    just before the scalar stmt S, and records a pointer to the vector code
 101    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 102    attached to S).  This pointer will be used for the vectorization of following
 103    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 104    otherwise, we rely on dead code elimination for removing it.
 105
 106         For example, say stmt S1 was vectorized into stmt VS1:
 107
 108    VS1: vb = px[i];
 109    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 110    S2:  a = b;
 111
 112    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 113    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 114    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 115    resulting sequence would be:
 116
 117    VS1: vb = px[i];
 118    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 119    VS2: va = vb;
 120    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 121
 122         Operands that are not SSA_NAMEs, are data-refs that appear in
 123    load/store operations (like 'x[i]' in S1), and are handled differently.
 124
 125    Target modeling:
 126    =================
 127         Currently the only target specific information that is used is the
 128    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 129    Targets that can support different sizes of vectors, for now will need
 130    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 131    flexibility will be added in the future.
 132
 133         Since we only vectorize operations which vector form can be
 134    expressed using existing tree codes, to verify that an operation is
 135    supported, the vectorizer checks the relevant optab at the relevant
 136    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 137    the value found is CODE_FOR_nothing, then there's no target support, and
 138    we can't vectorize the stmt.
 139
 140    For additional information on this project see:
 141    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 142 */
 143
 144 /* Function vect_determine_vectorization_factor
 145
 146    Determine the vectorization factor (VF).  VF is the number of data elements
 147    that are operated upon in parallel in a single iteration of the vectorized
 148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 150    elements can fit in a single vector register.
 151
 152    We currently support vectorization of loops in which all types operated upon
 153    are of the same size.  Therefore this function currently sets VF according to
 154    the size of the types operated upon, and fails if there are multiple sizes
 155    in the loop.
 156
 157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 158    original loop:
 159         for (i=0; i<N; i++){
 160           a[i] = b[i] + c[i];
 161         }
 162
 163    vectorized loop:
 164         for (i=0; i<N; i+=VF){
 165           a[i:VF] = b[i:VF] + c[i:VF];
 166         }
 167 */
 168
 169 static bool
 170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 171 {
 172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 174   int nbbs = loop->num_nodes;
 175   gimple_stmt_iterator si;
 176   unsigned int vectorization_factor = 0;
 177   tree scalar_type;
 178   gimple phi;
 179   tree vectype;
 180   unsigned int nunits;
 181   stmt_vec_info stmt_info;
 182   int i;
 183   HOST_WIDE_INT dummy;
 184   gimple stmt, pattern_stmt = NULL, pattern_def_stmt = NULL;
 185   bool analyze_pattern_stmt = false, pattern_def = false;
 186
 187   if (vect_print_dump_info (REPORT_DETAILS))
 188     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 189
 190   for (i = 0; i < nbbs; i++)
 191     {
 192       basic_block bb = bbs[i];
 193
 194       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 195         {
 196           phi = gsi_stmt (si);
 197           stmt_info = vinfo_for_stmt (phi);
 198           if (vect_print_dump_info (REPORT_DETAILS))
 199             {
 200               fprintf (vect_dump, "==> examining phi: ");
 201               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 202             }
 203
 204           gcc_assert (stmt_info);
 205
 206           if (STMT_VINFO_RELEVANT_P (stmt_info))
 207             {
 208               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 209               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 210
 211               if (vect_print_dump_info (REPORT_DETAILS))
 212                 {
 213                   fprintf (vect_dump, "get vectype for scalar type:  ");
 214                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 215                 }
 216
 217               vectype = get_vectype_for_scalar_type (scalar_type);
 218               if (!vectype)
 219                 {
 220                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 221                     {
 222                       fprintf (vect_dump,
 223                                "not vectorized: unsupported data-type ");
 224                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 225                     }
 226                   return false;
 227                 }
 228               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 229
 230               if (vect_print_dump_info (REPORT_DETAILS))
 231                 {
 232                   fprintf (vect_dump, "vectype: ");
 233                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 234                 }
 235
 236               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 237               if (vect_print_dump_info (REPORT_DETAILS))
 238                 fprintf (vect_dump, "nunits = %d", nunits);
 239
 240               if (!vectorization_factor
 241                   || (nunits > vectorization_factor))
 242                 vectorization_factor = nunits;
 243             }
 244         }
 245
 246       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 247         {
 248           tree vf_vectype;
 249
 250           if (analyze_pattern_stmt)
 251             {
 252               stmt = pattern_stmt;
 253               analyze_pattern_stmt = false;
 254             }
 255           else
 256             stmt = gsi_stmt (si);
 257
 258           stmt_info = vinfo_for_stmt (stmt);
 259
 260           if (vect_print_dump_info (REPORT_DETAILS))
 261             {
 262               fprintf (vect_dump, "==> examining statement: ");
 263               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 264             }
 265
 266           gcc_assert (stmt_info);
 267
 268           /* Skip stmts which do not need to be vectorized.  */
 269           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 270               && !STMT_VINFO_LIVE_P (stmt_info))
 271             {
 272               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 273                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 274                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 275                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 276                 {
 277                   stmt = pattern_stmt;
 278                   stmt_info = vinfo_for_stmt (pattern_stmt);
 279                   if (vect_print_dump_info (REPORT_DETAILS))
 280                     {
 281                       fprintf (vect_dump, "==> examining pattern statement: ");
 282                       print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 283                     }
 284                 }
 285               else
 286                 {
 287                   if (vect_print_dump_info (REPORT_DETAILS))
 288                     fprintf (vect_dump, "skip.");
 289                   gsi_next (&si);
 290                   continue;
 291                 }
 292             }
 293           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 294                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 295                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 296                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 297             analyze_pattern_stmt = true;
 298
 299           /* If a pattern statement has a def stmt, analyze it too.  */
 300           if (is_pattern_stmt_p (stmt_info)
 301               && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
 302               && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
 303                   || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
 304             {
 305               if (pattern_def)
 306                 pattern_def = false;
 307               else
 308                 {
 309                   if (vect_print_dump_info (REPORT_DETAILS))
 310                     {
 311                       fprintf (vect_dump, "==> examining pattern def stmt: ");
 312                       print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
 313                                          TDF_SLIM);
 314                     }
 315
 316                   pattern_def = true;
 317                   stmt = pattern_def_stmt;
 318                   stmt_info = vinfo_for_stmt (stmt);
 319                 }
 320             }
 321
 322           if (gimple_get_lhs (stmt) == NULL_TREE)
 323             {
 324               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 325                 {
 326                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 327                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 328                 }
 329               return false;
 330             }
 331
 332           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 333             {
 334               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 335                 {
 336                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 337                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 338                 }
 339               return false;
 340             }
 341
 342           if (STMT_VINFO_VECTYPE (stmt_info))
 343             {
 344               /* The only case when a vectype had been already set is for stmts
 345                  that contain a dataref, or for "pattern-stmts" (stmts generated
 346                  by the vectorizer to represent/replace a certain idiom).  */
 347               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 348                           || is_pattern_stmt_p (stmt_info));
 349               vectype = STMT_VINFO_VECTYPE (stmt_info);
 350             }
 351           else
 352             {
 353               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 354               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 355               if (vect_print_dump_info (REPORT_DETAILS))
 356                 {
 357                   fprintf (vect_dump, "get vectype for scalar type:  ");
 358                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 359                 }
 360               vectype = get_vectype_for_scalar_type (scalar_type);
 361               if (!vectype)
 362                 {
 363                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 364                     {
 365                       fprintf (vect_dump,
 366                                "not vectorized: unsupported data-type ");
 367                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 368                     }
 369                   return false;
 370                 }
 371
 372               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 373             }
 374
 375           /* The vectorization factor is according to the smallest
 376              scalar type (or the largest vector size, but we only
 377              support one vector size per loop).  */
 378           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 379                                                        &dummy);
 380           if (vect_print_dump_info (REPORT_DETAILS))
 381             {
 382               fprintf (vect_dump, "get vectype for scalar type:  ");
 383               print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 384             }
 385           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 386           if (!vf_vectype)
 387             {
 388               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 389                 {
 390                   fprintf (vect_dump,
 391                            "not vectorized: unsupported data-type ");
 392                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 393                 }
 394               return false;
 395             }
 396
 397           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 398                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 399             {
 400               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 401                 {
 402                   fprintf (vect_dump,
 403                            "not vectorized: different sized vector "
 404                            "types in statement, ");
 405                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 406                   fprintf (vect_dump, " and ");
 407                   print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 408                 }
 409               return false;
 410             }
 411
 412           if (vect_print_dump_info (REPORT_DETAILS))
 413             {
 414               fprintf (vect_dump, "vectype: ");
 415               print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 416             }
 417
 418           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 419           if (vect_print_dump_info (REPORT_DETAILS))
 420             fprintf (vect_dump, "nunits = %d", nunits);
 421
 422           if (!vectorization_factor
 423               || (nunits > vectorization_factor))
 424             vectorization_factor = nunits;
 425
 426           if (!analyze_pattern_stmt && !pattern_def)
 427             gsi_next (&si);
 428         }
 429     }
 430
 431   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 432   if (vect_print_dump_info (REPORT_DETAILS))
 433     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 434   if (vectorization_factor <= 1)
 435     {
 436       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 437         fprintf (vect_dump, "not vectorized: unsupported data-type");
 438       return false;
 439     }
 440   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 441
 442   return true;
 443 }
 444
 445
 446 /* Function vect_is_simple_iv_evolution.
 447
 448    FORNOW: A simple evolution of an induction variables in the loop is
 449    considered a polynomial evolution with constant step.  */
 450
 451 static bool
 452 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 453                              tree * step)
 454 {
 455   tree init_expr;
 456   tree step_expr;
 457   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 458
 459   /* When there is no evolution in this loop, the evolution function
 460      is not "simple".  */
 461   if (evolution_part == NULL_TREE)
 462     return false;
 463
 464   /* When the evolution is a polynomial of degree >= 2
 465      the evolution function is not "simple".  */
 466   if (tree_is_chrec (evolution_part))
 467     return false;
 468
 469   step_expr = evolution_part;
 470   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 471
 472   if (vect_print_dump_info (REPORT_DETAILS))
 473     {
 474       fprintf (vect_dump, "step: ");
 475       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 476       fprintf (vect_dump, ",  init: ");
 477       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 478     }
 479
 480   *init = init_expr;
 481   *step = step_expr;
 482
 483   if (TREE_CODE (step_expr) != INTEGER_CST)
 484     {
 485       if (vect_print_dump_info (REPORT_DETAILS))
 486         fprintf (vect_dump, "step unknown.");
 487       return false;
 488     }
 489
 490   return true;
 491 }
 492
 493 /* Function vect_analyze_scalar_cycles_1.
 494
 495    Examine the cross iteration def-use cycles of scalar variables
 496    in LOOP.  LOOP_VINFO represents the loop that is now being
 497    considered for vectorization (can be LOOP, or an outer-loop
 498    enclosing LOOP).  */
 499
 500 static void
 501 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 502 {
 503   basic_block bb = loop->header;
 504   tree dumy;
 505   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 506   gimple_stmt_iterator gsi;
 507   bool double_reduc;
 508
 509   if (vect_print_dump_info (REPORT_DETAILS))
 510     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 511
 512   /* First - identify all inductions.  Reduction detection assumes that all the
 513      inductions have been identified, therefore, this order must not be
 514      changed.  */
 515   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 516     {
 517       gimple phi = gsi_stmt (gsi);
 518       tree access_fn = NULL;
 519       tree def = PHI_RESULT (phi);
 520       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 521
 522       if (vect_print_dump_info (REPORT_DETAILS))
 523         {
 524           fprintf (vect_dump, "Analyze phi: ");
 525           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 526         }
 527
 528       /* Skip virtual phi's.  The data dependences that are associated with
 529          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 530       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 531         continue;
 532
 533       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 534
 535       /* Analyze the evolution function.  */
 536       access_fn = analyze_scalar_evolution (loop, def);
 537       if (access_fn)
 538         STRIP_NOPS (access_fn);
 539       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 540         {
 541           fprintf (vect_dump, "Access function of PHI: ");
 542           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 543         }
 544
 545       if (!access_fn
 546           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 547         {
 548           VEC_safe_push (gimple, heap, worklist, phi);
 549           continue;
 550         }
 551
 552       if (vect_print_dump_info (REPORT_DETAILS))
 553         fprintf (vect_dump, "Detected induction.");
 554       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 555     }
 556
 557
 558   /* Second - identify all reductions and nested cycles.  */
 559   while (VEC_length (gimple, worklist) > 0)
 560     {
 561       gimple phi = VEC_pop (gimple, worklist);
 562       tree def = PHI_RESULT (phi);
 563       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 564       gimple reduc_stmt;
 565       bool nested_cycle;
 566
 567       if (vect_print_dump_info (REPORT_DETAILS))
 568         {
 569           fprintf (vect_dump, "Analyze phi: ");
 570           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 571         }
 572
 573       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 574       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 575
 576       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 577       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 578                                                 &double_reduc);
 579       if (reduc_stmt)
 580         {
 581           if (double_reduc)
 582             {
 583               if (vect_print_dump_info (REPORT_DETAILS))
 584                 fprintf (vect_dump, "Detected double reduction.");
 585
 586               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 587               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 588                                                     vect_double_reduction_def;
 589             }
 590           else
 591             {
 592               if (nested_cycle)
 593                 {
 594                   if (vect_print_dump_info (REPORT_DETAILS))
 595                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 596
 597                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 598                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 599                                                              vect_nested_cycle;
 600                 }
 601               else
 602                 {
 603                   if (vect_print_dump_info (REPORT_DETAILS))
 604                     fprintf (vect_dump, "Detected reduction.");
 605
 606                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 607                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 608                                                            vect_reduction_def;
 609                   /* Store the reduction cycles for possible vectorization in
 610                      loop-aware SLP.  */
 611                   VEC_safe_push (gimple, heap,
 612                                  LOOP_VINFO_REDUCTIONS (loop_vinfo),
 613                                  reduc_stmt);
 614                 }
 615             }
 616         }
 617       else
 618         if (vect_print_dump_info (REPORT_DETAILS))
 619           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 620     }
 621
 622   VEC_free (gimple, heap, worklist);
 623 }
 624
 625
 626 /* Function vect_analyze_scalar_cycles.
 627
 628    Examine the cross iteration def-use cycles of scalar variables, by
 629    analyzing the loop-header PHIs of scalar variables.  Classify each
 630    cycle as one of the following: invariant, induction, reduction, unknown.
 631    We do that for the loop represented by LOOP_VINFO, and also to its
 632    inner-loop, if exists.
 633    Examples for scalar cycles:
 634
 635    Example1: reduction:
 636
 637               loop1:
 638               for (i=0; i<N; i++)
 639                  sum += a[i];
 640
 641    Example2: induction:
 642
 643               loop2:
 644               for (i=0; i<N; i++)
 645                  a[i] = i;  */
 646
 647 static void
 648 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 649 {
 650   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 651
 652   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 653
 654   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 655      Reductions in such inner-loop therefore have different properties than
 656      the reductions in the nest that gets vectorized:
 657      1. When vectorized, they are executed in the same order as in the original
 658         scalar loop, so we can't change the order of computation when
 659         vectorizing them.
 660      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 661         current checks are too strict.  */
 662
 663   if (loop->inner)
 664     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 665 }
 666
 667 /* Function vect_get_loop_niters.
 668
 669    Determine how many iterations the loop is executed.
 670    If an expression that represents the number of iterations
 671    can be constructed, place it in NUMBER_OF_ITERATIONS.
 672    Return the loop exit condition.  */
 673
 674 static gimple
 675 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 676 {
 677   tree niters;
 678
 679   if (vect_print_dump_info (REPORT_DETAILS))
 680     fprintf (vect_dump, "=== get_loop_niters ===");
 681
 682   niters = number_of_exit_cond_executions (loop);
 683
 684   if (niters != NULL_TREE
 685       && niters != chrec_dont_know)
 686     {
 687       *number_of_iterations = niters;
 688
 689       if (vect_print_dump_info (REPORT_DETAILS))
 690         {
 691           fprintf (vect_dump, "==> get_loop_niters:" );
 692           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 693         }
 694     }
 695
 696   return get_loop_exit_condition (loop);
 697 }
 698
 699
 700 /* Function bb_in_loop_p
 701
 702    Used as predicate for dfs order traversal of the loop bbs.  */
 703
 704 static bool
 705 bb_in_loop_p (const_basic_block bb, const void *data)
 706 {
 707   const struct loop *const loop = (const struct loop *)data;
 708   if (flow_bb_inside_loop_p (loop, bb))
 709     return true;
 710   return false;
 711 }
 712
 713
 714 /* Function new_loop_vec_info.
 715
 716    Create and initialize a new loop_vec_info struct for LOOP, as well as
 717    stmt_vec_info structs for all the stmts in LOOP.  */
 718
 719 static loop_vec_info
 720 new_loop_vec_info (struct loop *loop)
 721 {
 722   loop_vec_info res;
 723   basic_block *bbs;
 724   gimple_stmt_iterator si;
 725   unsigned int i, nbbs;
 726
 727   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 728   LOOP_VINFO_LOOP (res) = loop;
 729
 730   bbs = get_loop_body (loop);
 731
 732   /* Create/Update stmt_info for all stmts in the loop.  */
 733   for (i = 0; i < loop->num_nodes; i++)
 734     {
 735       basic_block bb = bbs[i];
 736
 737       /* BBs in a nested inner-loop will have been already processed (because
 738          we will have called vect_analyze_loop_form for any nested inner-loop).
 739          Therefore, for stmts in an inner-loop we just want to update the
 740          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 741          loop_info of the outer-loop we are currently considering to vectorize
 742          (instead of the loop_info of the inner-loop).
 743          For stmts in other BBs we need to create a stmt_info from scratch.  */
 744       if (bb->loop_father != loop)
 745         {
 746           /* Inner-loop bb.  */
 747           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 748           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 749             {
 750               gimple phi = gsi_stmt (si);
 751               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 752               loop_vec_info inner_loop_vinfo =
 753                 STMT_VINFO_LOOP_VINFO (stmt_info);
 754               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 755               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 756             }
 757           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 758            {
 759               gimple stmt = gsi_stmt (si);
 760               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 761               loop_vec_info inner_loop_vinfo =
 762                  STMT_VINFO_LOOP_VINFO (stmt_info);
 763               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 764               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 765            }
 766         }
 767       else
 768         {
 769           /* bb in current nest.  */
 770           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 771             {
 772               gimple phi = gsi_stmt (si);
 773               gimple_set_uid (phi, 0);
 774               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 775             }
 776
 777           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 778             {
 779               gimple stmt = gsi_stmt (si);
 780               gimple_set_uid (stmt, 0);
 781               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 782             }
 783         }
 784     }
 785
 786   /* CHECKME: We want to visit all BBs before their successors (except for
 787      latch blocks, for which this assertion wouldn't hold).  In the simple
 788      case of the loop forms we allow, a dfs order of the BBs would the same
 789      as reversed postorder traversal, so we are safe.  */
 790
 791    free (bbs);
 792    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 793    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 794                               bbs, loop->num_nodes, loop);
 795    gcc_assert (nbbs == loop->num_nodes);
 796
 797   LOOP_VINFO_BBS (res) = bbs;
 798   LOOP_VINFO_NITERS (res) = NULL;
 799   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 800   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 801   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 802   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 803   LOOP_VINFO_VECT_FACTOR (res) = 0;
 804   LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
 805   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 806   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 807   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 808   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 809     VEC_alloc (gimple, heap,
 810                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 811   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 812     VEC_alloc (ddr_p, heap,
 813                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 814   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 815   LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
 816   LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
 817   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 818   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 819   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 820   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 821
 822   return res;
 823 }
 824
 825
 826 /* Function destroy_loop_vec_info.
 827
 828    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 829    stmts in the loop.  */
 830
 831 void
 832 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 833 {
 834   struct loop *loop;
 835   basic_block *bbs;
 836   int nbbs;
 837   gimple_stmt_iterator si;
 838   int j;
 839   VEC (slp_instance, heap) *slp_instances;
 840   slp_instance instance;
 841
 842   if (!loop_vinfo)
 843     return;
 844
 845   loop = LOOP_VINFO_LOOP (loop_vinfo);
 846
 847   bbs = LOOP_VINFO_BBS (loop_vinfo);
 848   nbbs = loop->num_nodes;
 849
 850   if (!clean_stmts)
 851     {
 852       free (LOOP_VINFO_BBS (loop_vinfo));
 853       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 854       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 855       VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
 856       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 857       VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 858
 859       free (loop_vinfo);
 860       loop->aux = NULL;
 861       return;
 862     }
 863
 864   for (j = 0; j < nbbs; j++)
 865     {
 866       basic_block bb = bbs[j];
 867       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 868         free_stmt_vec_info (gsi_stmt (si));
 869
 870       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 871         {
 872           gimple stmt = gsi_stmt (si);
 873           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 874
 875           if (stmt_info)
 876             {
 877               /* Check if this statement has a related "pattern stmt"
 878                  (introduced by the vectorizer during the pattern recognition
 879                  pass).  Free pattern's stmt_vec_info.  */
 880               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 881                   && vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)))
 882                 free_stmt_vec_info (STMT_VINFO_RELATED_STMT (stmt_info));
 883
 884               /* Free stmt_vec_info.  */
 885               free_stmt_vec_info (stmt);
 886             }
 887
 888           gsi_next (&si);
 889         }
 890     }
 891
 892   free (LOOP_VINFO_BBS (loop_vinfo));
 893   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 894   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 895   VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
 896   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 897   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 898   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 899   FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
 900     vect_free_slp_instance (instance);
 901
 902   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 903   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 904   VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
 905   VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
 906
 907   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 908     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 909
 910   free (loop_vinfo);
 911   loop->aux = NULL;
 912 }
 913
 914
 915 /* Function vect_analyze_loop_1.
 916
 917    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 918    for it. The different analyses will record information in the
 919    loop_vec_info struct.  This is a subset of the analyses applied in
 920    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 921    that is now considered for (outer-loop) vectorization.  */
 922
 923 static loop_vec_info
 924 vect_analyze_loop_1 (struct loop *loop)
 925 {
 926   loop_vec_info loop_vinfo;
 927
 928   if (vect_print_dump_info (REPORT_DETAILS))
 929     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 930
 931   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 932
 933   loop_vinfo = vect_analyze_loop_form (loop);
 934   if (!loop_vinfo)
 935     {
 936       if (vect_print_dump_info (REPORT_DETAILS))
 937         fprintf (vect_dump, "bad inner-loop form.");
 938       return NULL;
 939     }
 940
 941   return loop_vinfo;
 942 }
 943
 944
 945 /* Function vect_analyze_loop_form.
 946
 947    Verify that certain CFG restrictions hold, including:
 948    - the loop has a pre-header
 949    - the loop has a single entry and exit
 950    - the loop exit condition is simple enough, and the number of iterations
 951      can be analyzed (a countable loop).  */
 952
 953 loop_vec_info
 954 vect_analyze_loop_form (struct loop *loop)
 955 {
 956   loop_vec_info loop_vinfo;
 957   gimple loop_cond;
 958   tree number_of_iterations = NULL;
 959   loop_vec_info inner_loop_vinfo = NULL;
 960
 961   if (vect_print_dump_info (REPORT_DETAILS))
 962     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 963
 964   /* Different restrictions apply when we are considering an inner-most loop,
 965      vs. an outer (nested) loop.
 966      (FORNOW. May want to relax some of these restrictions in the future).  */
 967
 968   if (!loop->inner)
 969     {
 970       /* Inner-most loop.  We currently require that the number of BBs is
 971          exactly 2 (the header and latch).  Vectorizable inner-most loops
 972          look like this:
 973
 974                         (pre-header)
 975                            |
 976                           header <--------+
 977                            | |            |
 978                            | +--> latch --+
 979                            |
 980                         (exit-bb)  */
 981
 982       if (loop->num_nodes != 2)
 983         {
 984           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 985             fprintf (vect_dump, "not vectorized: control flow in loop.");
 986           return NULL;
 987         }
 988
 989       if (empty_block_p (loop->header))
 990     {
 991           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 992             fprintf (vect_dump, "not vectorized: empty loop.");
 993       return NULL;
 994     }
 995     }
 996   else
 997     {
 998       struct loop *innerloop = loop->inner;
 999       edge entryedge;
1000
1001       /* Nested loop. We currently require that the loop is doubly-nested,
1002          contains a single inner loop, and the number of BBs is exactly 5.
1003          Vectorizable outer-loops look like this:
1004
1005                         (pre-header)
1006                            |
1007                           header <---+
1008                            |         |
1009                           inner-loop |
1010                            |         |
1011                           tail ------+
1012                            |
1013                         (exit-bb)
1014
1015          The inner-loop has the properties expected of inner-most loops
1016          as described above.  */
1017
1018       if ((loop->inner)->inner || (loop->inner)->next)
1019         {
1020           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1021             fprintf (vect_dump, "not vectorized: multiple nested loops.");
1022           return NULL;
1023         }
1024
1025       /* Analyze the inner-loop.  */
1026       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1027       if (!inner_loop_vinfo)
1028         {
1029           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1030             fprintf (vect_dump, "not vectorized: Bad inner loop.");
1031           return NULL;
1032         }
1033
1034       if (!expr_invariant_in_loop_p (loop,
1035                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1036         {
1037           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1038             fprintf (vect_dump,
1039                      "not vectorized: inner-loop count not invariant.");
1040           destroy_loop_vec_info (inner_loop_vinfo, true);
1041           return NULL;
1042         }
1043
1044       if (loop->num_nodes != 5)
1045         {
1046           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1047             fprintf (vect_dump, "not vectorized: control flow in loop.");
1048           destroy_loop_vec_info (inner_loop_vinfo, true);
1049           return NULL;
1050         }
1051
1052       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1053       entryedge = EDGE_PRED (innerloop->header, 0);
1054       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1055         entryedge = EDGE_PRED (innerloop->header, 1);
1056
1057       if (entryedge->src != loop->header
1058           || !single_exit (innerloop)
1059           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1060         {
1061           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1062             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1063           destroy_loop_vec_info (inner_loop_vinfo, true);
1064           return NULL;
1065         }
1066
1067       if (vect_print_dump_info (REPORT_DETAILS))
1068         fprintf (vect_dump, "Considering outer-loop vectorization.");
1069     }
1070
1071   if (!single_exit (loop)
1072       || EDGE_COUNT (loop->header->preds) != 2)
1073     {
1074       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1075         {
1076           if (!single_exit (loop))
1077             fprintf (vect_dump, "not vectorized: multiple exits.");
1078           else if (EDGE_COUNT (loop->header->preds) != 2)
1079             fprintf (vect_dump, "not vectorized: too many incoming edges.");
1080         }
1081       if (inner_loop_vinfo)
1082         destroy_loop_vec_info (inner_loop_vinfo, true);
1083       return NULL;
1084     }
1085
1086   /* We assume that the loop exit condition is at the end of the loop. i.e,
1087      that the loop is represented as a do-while (with a proper if-guard
1088      before the loop if needed), where the loop header contains all the
1089      executable statements, and the latch is empty.  */
1090   if (!empty_block_p (loop->latch)
1091         || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1092     {
1093       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1094         fprintf (vect_dump, "not vectorized: unexpected loop form.");
1095       if (inner_loop_vinfo)
1096         destroy_loop_vec_info (inner_loop_vinfo, true);
1097       return NULL;
1098     }
1099
1100   /* Make sure there exists a single-predecessor exit bb:  */
1101   if (!single_pred_p (single_exit (loop)->dest))
1102     {
1103       edge e = single_exit (loop);
1104       if (!(e->flags & EDGE_ABNORMAL))
1105         {
1106           split_loop_exit_edge (e);
1107           if (vect_print_dump_info (REPORT_DETAILS))
1108             fprintf (vect_dump, "split exit edge.");
1109         }
1110       else
1111         {
1112           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1113             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1114           if (inner_loop_vinfo)
1115             destroy_loop_vec_info (inner_loop_vinfo, true);
1116           return NULL;
1117         }
1118     }
1119
1120   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1121   if (!loop_cond)
1122     {
1123       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1124         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1125       if (inner_loop_vinfo)
1126         destroy_loop_vec_info (inner_loop_vinfo, true);
1127       return NULL;
1128     }
1129
1130   if (!number_of_iterations)
1131     {
1132       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1133         fprintf (vect_dump,
1134                  "not vectorized: number of iterations cannot be computed.");
1135       if (inner_loop_vinfo)
1136         destroy_loop_vec_info (inner_loop_vinfo, true);
1137       return NULL;
1138     }
1139
1140   if (chrec_contains_undetermined (number_of_iterations))
1141     {
1142       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1143         fprintf (vect_dump, "Infinite number of iterations.");
1144       if (inner_loop_vinfo)
1145         destroy_loop_vec_info (inner_loop_vinfo, true);
1146       return NULL;
1147     }
1148
1149   if (!NITERS_KNOWN_P (number_of_iterations))
1150     {
1151       if (vect_print_dump_info (REPORT_DETAILS))
1152         {
1153           fprintf (vect_dump, "Symbolic number of iterations is ");
1154           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1155         }
1156     }
1157   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1158     {
1159       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1160         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1161       if (inner_loop_vinfo)
1162         destroy_loop_vec_info (inner_loop_vinfo, false);
1163       return NULL;
1164     }
1165
1166   loop_vinfo = new_loop_vec_info (loop);
1167   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1168   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1169
1170   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1171
1172   /* CHECKME: May want to keep it around it in the future.  */
1173   if (inner_loop_vinfo)
1174     destroy_loop_vec_info (inner_loop_vinfo, false);
1175
1176   gcc_assert (!loop->aux);
1177   loop->aux = loop_vinfo;
1178   return loop_vinfo;
1179 }
1180
1181
1182 /* Get cost by calling cost target builtin.  */
1183
1184 static inline int
1185 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1186 {
1187   tree dummy_type = NULL;
1188   int dummy = 0;
1189
1190   return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1191                                                        dummy_type, dummy);
1192 }
1193
1194
1195 /* Function vect_analyze_loop_operations.
1196
1197    Scan the loop stmts and make sure they are all vectorizable.  */
1198
1199 static bool
1200 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1201 {
1202   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1203   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1204   int nbbs = loop->num_nodes;
1205   gimple_stmt_iterator si;
1206   unsigned int vectorization_factor = 0;
1207   int i;
1208   gimple phi;
1209   stmt_vec_info stmt_info;
1210   bool need_to_vectorize = false;
1211   int min_profitable_iters;
1212   int min_scalar_loop_bound;
1213   unsigned int th;
1214   bool only_slp_in_loop = true, ok;
1215
1216   if (vect_print_dump_info (REPORT_DETAILS))
1217     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1218
1219   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1220   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1221   if (slp)
1222     {
1223       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1224          vectorization factor of the loop is the unrolling factor required by
1225          the SLP instances.  If that unrolling factor is 1, we say, that we
1226          perform pure SLP on loop - cross iteration parallelism is not
1227          exploited.  */
1228       for (i = 0; i < nbbs; i++)
1229         {
1230           basic_block bb = bbs[i];
1231           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1232             {
1233               gimple stmt = gsi_stmt (si);
1234               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1235               gcc_assert (stmt_info);
1236               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1237                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1238                   && !PURE_SLP_STMT (stmt_info))
1239                 /* STMT needs both SLP and loop-based vectorization.  */
1240                 only_slp_in_loop = false;
1241             }
1242         }
1243
1244       if (only_slp_in_loop)
1245         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1246       else
1247         vectorization_factor = least_common_multiple (vectorization_factor,
1248                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1249
1250       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1251       if (vect_print_dump_info (REPORT_DETAILS))
1252         fprintf (vect_dump, "Updating vectorization factor to %d ",
1253                             vectorization_factor);
1254     }
1255
1256   for (i = 0; i < nbbs; i++)
1257     {
1258       basic_block bb = bbs[i];
1259
1260       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1261         {
1262           phi = gsi_stmt (si);
1263           ok = true;
1264
1265           stmt_info = vinfo_for_stmt (phi);
1266           if (vect_print_dump_info (REPORT_DETAILS))
1267             {
1268               fprintf (vect_dump, "examining phi: ");
1269               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1270             }
1271
1272           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1273              (i.e., a phi in the tail of the outer-loop).  */
1274           if (! is_loop_header_bb_p (bb))
1275             {
1276               /* FORNOW: we currently don't support the case that these phis
1277                  are not used in the outerloop (unless it is double reduction,
1278                  i.e., this phi is vect_reduction_def), cause this case
1279                  requires to actually do something here.  */
1280               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1281                    || STMT_VINFO_LIVE_P (stmt_info))
1282                   && STMT_VINFO_DEF_TYPE (stmt_info)
1283                      != vect_double_reduction_def)
1284                 {
1285                   if (vect_print_dump_info (REPORT_DETAILS))
1286                     fprintf (vect_dump,
1287                              "Unsupported loop-closed phi in outer-loop.");
1288                   return false;
1289                 }
1290
1291               /* If PHI is used in the outer loop, we check that its operand
1292                  is defined in the inner loop.  */
1293               if (STMT_VINFO_RELEVANT_P (stmt_info))
1294                 {
1295                   tree phi_op;
1296                   gimple op_def_stmt;
1297
1298                   if (gimple_phi_num_args (phi) != 1)
1299                     return false;
1300
1301                   phi_op = PHI_ARG_DEF (phi, 0);
1302                   if (TREE_CODE (phi_op) != SSA_NAME)
1303                     return false;
1304
1305                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1306                   if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1307                     return false;
1308
1309                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1310                         != vect_used_in_outer
1311                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1312                            != vect_used_in_outer_by_reduction)
1313                     return false;
1314                 }
1315
1316               continue;
1317             }
1318
1319           gcc_assert (stmt_info);
1320
1321           if (STMT_VINFO_LIVE_P (stmt_info))
1322             {
1323               /* FORNOW: not yet supported.  */
1324               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1325                 fprintf (vect_dump, "not vectorized: value used after loop.");
1326               return false;
1327             }
1328
1329           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1330               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1331             {
1332               /* A scalar-dependence cycle that we don't support.  */
1333               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1334                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1335               return false;
1336             }
1337
1338           if (STMT_VINFO_RELEVANT_P (stmt_info))
1339             {
1340               need_to_vectorize = true;
1341               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1342                 ok = vectorizable_induction (phi, NULL, NULL);
1343             }
1344
1345           if (!ok)
1346             {
1347               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1348                 {
1349                   fprintf (vect_dump,
1350                            "not vectorized: relevant phi not supported: ");
1351                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1352                 }
1353               return false;
1354             }
1355         }
1356
1357       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1358         {
1359           gimple stmt = gsi_stmt (si);
1360           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1361             return false;
1362         }
1363     } /* bbs */
1364
1365   /* All operations in the loop are either irrelevant (deal with loop
1366      control, or dead), or only used outside the loop and can be moved
1367      out of the loop (e.g. invariants, inductions).  The loop can be
1368      optimized away by scalar optimizations.  We're better off not
1369      touching this loop.  */
1370   if (!need_to_vectorize)
1371     {
1372       if (vect_print_dump_info (REPORT_DETAILS))
1373         fprintf (vect_dump,
1374                  "All the computation can be taken out of the loop.");
1375       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1376         fprintf (vect_dump,
1377                  "not vectorized: redundant loop. no profit to vectorize.");
1378       return false;
1379     }
1380
1381   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1382       && vect_print_dump_info (REPORT_DETAILS))
1383     fprintf (vect_dump,
1384         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1385         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1386
1387   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1388       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1389     {
1390       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1391         fprintf (vect_dump, "not vectorized: iteration count too small.");
1392       if (vect_print_dump_info (REPORT_DETAILS))
1393         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1394                  "vectorization factor.");
1395       return false;
1396     }
1397
1398   /* Analyze cost.  Decide if worth while to vectorize.  */
1399
1400   /* Once VF is set, SLP costs should be updated since the number of created
1401      vector stmts depends on VF.  */
1402   vect_update_slp_costs_according_to_vf (loop_vinfo);
1403
1404   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1405   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1406
1407   if (min_profitable_iters < 0)
1408     {
1409       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1410         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1411       if (vect_print_dump_info (REPORT_DETAILS))
1412         fprintf (vect_dump, "not vectorized: vector version will never be "
1413                  "profitable.");
1414       return false;
1415     }
1416
1417   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1418                             * vectorization_factor) - 1);
1419
1420   /* Use the cost model only if it is more conservative than user specified
1421      threshold.  */
1422
1423   th = (unsigned) min_scalar_loop_bound;
1424   if (min_profitable_iters
1425       && (!min_scalar_loop_bound
1426           || min_profitable_iters > min_scalar_loop_bound))
1427     th = (unsigned) min_profitable_iters;
1428
1429   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1430       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1431     {
1432       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1433         fprintf (vect_dump, "not vectorized: vectorization not "
1434                  "profitable.");
1435       if (vect_print_dump_info (REPORT_DETAILS))
1436         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1437                  "user specified loop bound parameter or minimum "
1438                  "profitable iterations (whichever is more conservative).");
1439       return false;
1440     }
1441
1442   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1443       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1444       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1445     {
1446       if (vect_print_dump_info (REPORT_DETAILS))
1447         fprintf (vect_dump, "epilog loop required.");
1448       if (!vect_can_advance_ivs_p (loop_vinfo))
1449         {
1450           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1451             fprintf (vect_dump,
1452                      "not vectorized: can't create epilog loop 1.");
1453           return false;
1454         }
1455       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1456         {
1457           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1458             fprintf (vect_dump,
1459                      "not vectorized: can't create epilog loop 2.");
1460           return false;
1461         }
1462     }
1463
1464   return true;
1465 }
1466
1467
1468 /* Function vect_analyze_loop_2.
1469
1470    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1471    for it.  The different analyses will record information in the
1472    loop_vec_info struct.  */
1473 static bool
1474 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1475 {
1476   bool ok, dummy, slp = false;
1477   int max_vf = MAX_VECTORIZATION_FACTOR;
1478   int min_vf = 2;
1479
1480   /* Find all data references in the loop (which correspond to vdefs/vuses)
1481      and analyze their evolution in the loop.  Also adjust the minimal
1482      vectorization factor according to the loads and stores.
1483
1484      FORNOW: Handle only simple, array references, which
1485      alignment can be forced, and aligned pointer-references.  */
1486
1487   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1488   if (!ok)
1489     {
1490       if (vect_print_dump_info (REPORT_DETAILS))
1491         fprintf (vect_dump, "bad data references.");
1492       return false;
1493     }
1494
1495   /* Classify all cross-iteration scalar data-flow cycles.
1496      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1497
1498   vect_analyze_scalar_cycles (loop_vinfo);
1499
1500   vect_pattern_recog (loop_vinfo);
1501
1502   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1503
1504   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1505   if (!ok)
1506     {
1507       if (vect_print_dump_info (REPORT_DETAILS))
1508         fprintf (vect_dump, "unexpected pattern.");
1509       return false;
1510     }
1511
1512   /* Analyze data dependences between the data-refs in the loop
1513      and adjust the maximum vectorization factor according to
1514      the dependences.
1515      FORNOW: fail at the first data dependence that we encounter.  */
1516
1517   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf, &dummy);
1518   if (!ok
1519       || max_vf < min_vf)
1520     {
1521       if (vect_print_dump_info (REPORT_DETAILS))
1522         fprintf (vect_dump, "bad data dependence.");
1523       return false;
1524     }
1525
1526   ok = vect_determine_vectorization_factor (loop_vinfo);
1527   if (!ok)
1528     {
1529       if (vect_print_dump_info (REPORT_DETAILS))
1530         fprintf (vect_dump, "can't determine vectorization factor.");
1531       return false;
1532     }
1533   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1534     {
1535       if (vect_print_dump_info (REPORT_DETAILS))
1536         fprintf (vect_dump, "bad data dependence.");
1537       return false;
1538     }
1539
1540   /* Analyze the alignment of the data-refs in the loop.
1541      Fail if a data reference is found that cannot be vectorized.  */
1542
1543   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1544   if (!ok)
1545     {
1546       if (vect_print_dump_info (REPORT_DETAILS))
1547         fprintf (vect_dump, "bad data alignment.");
1548       return false;
1549     }
1550
1551   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1552      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1553
1554   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1555   if (!ok)
1556     {
1557       if (vect_print_dump_info (REPORT_DETAILS))
1558         fprintf (vect_dump, "bad data access.");
1559       return false;
1560     }
1561
1562   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1563      It is important to call pruning after vect_analyze_data_ref_accesses,
1564      since we use grouping information gathered by interleaving analysis.  */
1565   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1566   if (!ok)
1567     {
1568       if (vect_print_dump_info (REPORT_DETAILS))
1569         fprintf (vect_dump, "too long list of versioning for alias "
1570                             "run-time tests.");
1571       return false;
1572     }
1573
1574   /* This pass will decide on using loop versioning and/or loop peeling in
1575      order to enhance the alignment of data references in the loop.  */
1576
1577   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1578   if (!ok)
1579     {
1580       if (vect_print_dump_info (REPORT_DETAILS))
1581         fprintf (vect_dump, "bad data alignment.");
1582       return false;
1583     }
1584
1585   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1586   ok = vect_analyze_slp (loop_vinfo, NULL);
1587   if (ok)
1588     {
1589       /* Decide which possible SLP instances to SLP.  */
1590       slp = vect_make_slp_decision (loop_vinfo);
1591
1592       /* Find stmts that need to be both vectorized and SLPed.  */
1593       vect_detect_hybrid_slp (loop_vinfo);
1594     }
1595   else
1596     return false;
1597
1598   /* Scan all the operations in the loop and make sure they are
1599      vectorizable.  */
1600
1601   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1602   if (!ok)
1603     {
1604       if (vect_print_dump_info (REPORT_DETAILS))
1605         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1606       return false;
1607     }
1608
1609   return true;
1610 }
1611
1612 /* Function vect_analyze_loop.
1613
1614    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1615    for it.  The different analyses will record information in the
1616    loop_vec_info struct.  */
1617 loop_vec_info
1618 vect_analyze_loop (struct loop *loop)
1619 {
1620   loop_vec_info loop_vinfo;
1621   unsigned int vector_sizes;
1622
1623   /* Autodetect first vector size we try.  */
1624   current_vector_size = 0;
1625   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1626
1627   if (vect_print_dump_info (REPORT_DETAILS))
1628     fprintf (vect_dump, "===== analyze_loop_nest =====");
1629
1630   if (loop_outer (loop)
1631       && loop_vec_info_for_loop (loop_outer (loop))
1632       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1633     {
1634       if (vect_print_dump_info (REPORT_DETAILS))
1635         fprintf (vect_dump, "outer-loop already vectorized.");
1636       return NULL;
1637     }
1638
1639   while (1)
1640     {
1641       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1642       loop_vinfo = vect_analyze_loop_form (loop);
1643       if (!loop_vinfo)
1644         {
1645           if (vect_print_dump_info (REPORT_DETAILS))
1646             fprintf (vect_dump, "bad loop form.");
1647           return NULL;
1648         }
1649
1650       if (vect_analyze_loop_2 (loop_vinfo))
1651         {
1652           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1653
1654           return loop_vinfo;
1655         }
1656
1657       destroy_loop_vec_info (loop_vinfo, true);
1658
1659       vector_sizes &= ~current_vector_size;
1660       if (vector_sizes == 0
1661           || current_vector_size == 0)
1662         return NULL;
1663
1664       /* Try the next biggest vector size.  */
1665       current_vector_size = 1 << floor_log2 (vector_sizes);
1666       if (vect_print_dump_info (REPORT_DETAILS))
1667         fprintf (vect_dump, "***** Re-trying analysis with "
1668                  "vector size %d\n", current_vector_size);
1669     }
1670 }
1671
1672
1673 /* Function reduction_code_for_scalar_code
1674
1675    Input:
1676    CODE - tree_code of a reduction operations.
1677
1678    Output:
1679    REDUC_CODE - the corresponding tree-code to be used to reduce the
1680       vector of partial results into a single scalar result (which
1681       will also reside in a vector) or ERROR_MARK if the operation is
1682       a supported reduction operation, but does not have such tree-code.
1683
1684    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1685
1686 static bool
1687 reduction_code_for_scalar_code (enum tree_code code,
1688                                 enum tree_code *reduc_code)
1689 {
1690   switch (code)
1691     {
1692       case MAX_EXPR:
1693         *reduc_code = REDUC_MAX_EXPR;
1694         return true;
1695
1696       case MIN_EXPR:
1697         *reduc_code = REDUC_MIN_EXPR;
1698         return true;
1699
1700       case PLUS_EXPR:
1701         *reduc_code = REDUC_PLUS_EXPR;
1702         return true;
1703
1704       case MULT_EXPR:
1705       case MINUS_EXPR:
1706       case BIT_IOR_EXPR:
1707       case BIT_XOR_EXPR:
1708       case BIT_AND_EXPR:
1709         *reduc_code = ERROR_MARK;
1710         return true;
1711
1712       default:
1713        return false;
1714     }
1715 }
1716
1717
1718 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1719    STMT is printed with a message MSG. */
1720
1721 static void
1722 report_vect_op (gimple stmt, const char *msg)
1723 {
1724   fprintf (vect_dump, "%s", msg);
1725   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1726 }
1727
1728
1729 /* Detect SLP reduction of the form:
1730
1731    #a1 = phi <a5, a0>
1732    a2 = operation (a1)
1733    a3 = operation (a2)
1734    a4 = operation (a3)
1735    a5 = operation (a4)
1736
1737    #a = phi <a5>
1738
1739    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1740    FIRST_STMT is the first reduction stmt in the chain
1741    (a2 = operation (a1)).
1742
1743    Return TRUE if a reduction chain was detected.  */
1744
1745 static bool
1746 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1747 {
1748   struct loop *loop = (gimple_bb (phi))->loop_father;
1749   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1750   enum tree_code code;
1751   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1752   stmt_vec_info use_stmt_info, current_stmt_info;
1753   tree lhs;
1754   imm_use_iterator imm_iter;
1755   use_operand_p use_p;
1756   int nloop_uses, size = 0, n_out_of_loop_uses;
1757   bool found = false;
1758
1759   if (loop != vect_loop)
1760     return false;
1761
1762   lhs = PHI_RESULT (phi);
1763   code = gimple_assign_rhs_code (first_stmt);
1764   while (1)
1765     {
1766       nloop_uses = 0;
1767       n_out_of_loop_uses = 0;
1768       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1769         {
1770           gimple use_stmt = USE_STMT (use_p);
1771           if (is_gimple_debug (use_stmt))
1772             continue;
1773
1774           use_stmt = USE_STMT (use_p);
1775
1776           /* Check if we got back to the reduction phi.  */
1777           if (use_stmt == phi)
1778             {
1779               loop_use_stmt = use_stmt;
1780               found = true;
1781               break;
1782             }
1783
1784           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1785             {
1786               if (vinfo_for_stmt (use_stmt)
1787                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1788                 {
1789                   loop_use_stmt = use_stmt;
1790                   nloop_uses++;
1791                 }
1792             }
1793            else
1794              n_out_of_loop_uses++;
1795
1796            /* There are can be either a single use in the loop or two uses in
1797               phi nodes.  */
1798            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1799              return false;
1800         }
1801
1802       if (found)
1803         break;
1804
1805       /* We reached a statement with no loop uses.  */
1806       if (nloop_uses == 0)
1807         return false;
1808
1809       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1810       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1811         return false;
1812
1813       if (!is_gimple_assign (loop_use_stmt)
1814           || code != gimple_assign_rhs_code (loop_use_stmt)
1815           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1816         return false;
1817
1818       /* Insert USE_STMT into reduction chain.  */
1819       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1820       if (current_stmt)
1821         {
1822           current_stmt_info = vinfo_for_stmt (current_stmt);
1823           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1824           GROUP_FIRST_ELEMENT (use_stmt_info)
1825             = GROUP_FIRST_ELEMENT (current_stmt_info);
1826         }
1827       else
1828         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1829
1830       lhs = gimple_assign_lhs (loop_use_stmt);
1831       current_stmt = loop_use_stmt;
1832       size++;
1833    }
1834
1835   if (!found || loop_use_stmt != phi || size < 2)
1836     return false;
1837
1838   /* Swap the operands, if needed, to make the reduction operand be the second
1839      operand.  */
1840   lhs = PHI_RESULT (phi);
1841   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1842   while (next_stmt)
1843     {
1844       if (gimple_assign_rhs2 (next_stmt) == lhs)
1845         {
1846           tree op = gimple_assign_rhs1 (next_stmt);
1847           gimple def_stmt = NULL;
1848
1849           if (TREE_CODE (op) == SSA_NAME)
1850             def_stmt = SSA_NAME_DEF_STMT (op);
1851
1852           /* Check that the other def is either defined in the loop
1853              ("vect_internal_def"), or it's an induction (defined by a
1854              loop-header phi-node).  */
1855           if (def_stmt
1856               && gimple_bb (def_stmt)
1857               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1858               && (is_gimple_assign (def_stmt)
1859                   || is_gimple_call (def_stmt)
1860                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1861                            == vect_induction_def
1862                   || (gimple_code (def_stmt) == GIMPLE_PHI
1863                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1864                                   == vect_internal_def
1865                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1866             {
1867               lhs = gimple_assign_lhs (next_stmt);
1868               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1869               continue;
1870             }
1871
1872           return false;
1873         }
1874       else
1875         {
1876           tree op = gimple_assign_rhs2 (next_stmt);
1877           gimple def_stmt = NULL;
1878
1879           if (TREE_CODE (op) == SSA_NAME)
1880             def_stmt = SSA_NAME_DEF_STMT (op);
1881
1882           /* Check that the other def is either defined in the loop
1883             ("vect_internal_def"), or it's an induction (defined by a
1884             loop-header phi-node).  */
1885           if (def_stmt
1886               && gimple_bb (def_stmt)
1887               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1888               && (is_gimple_assign (def_stmt)
1889                   || is_gimple_call (def_stmt)
1890                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1891                               == vect_induction_def
1892                   || (gimple_code (def_stmt) == GIMPLE_PHI
1893                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1894                                   == vect_internal_def
1895                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1896             {
1897               if (vect_print_dump_info (REPORT_DETAILS))
1898                 {
1899                   fprintf (vect_dump, "swapping oprnds: ");
1900                   print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
1901                 }
1902
1903               swap_tree_operands (next_stmt,
1904                                   gimple_assign_rhs1_ptr (next_stmt),
1905                                   gimple_assign_rhs2_ptr (next_stmt));
1906               mark_symbols_for_renaming (next_stmt);
1907             }
1908           else
1909             return false;
1910         }
1911
1912       lhs = gimple_assign_lhs (next_stmt);
1913       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1914     }
1915
1916   /* Save the chain for further analysis in SLP detection.  */
1917   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1918   VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1919   GROUP_SIZE (vinfo_for_stmt (first)) = size;
1920
1921   return true;
1922 }
1923
1924
1925 /* Function vect_is_simple_reduction_1
1926
1927    (1) Detect a cross-iteration def-use cycle that represents a simple
1928    reduction computation.  We look for the following pattern:
1929
1930    loop_header:
1931      a1 = phi < a0, a2 >
1932      a3 = ...
1933      a2 = operation (a3, a1)
1934
1935    such that:
1936    1. operation is commutative and associative and it is safe to
1937       change the order of the computation (if CHECK_REDUCTION is true)
1938    2. no uses for a2 in the loop (a2 is used out of the loop)
1939    3. no uses of a1 in the loop besides the reduction operation
1940    4. no uses of a1 outside the loop.
1941
1942    Conditions 1,4 are tested here.
1943    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1944
1945    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1946    nested cycles, if CHECK_REDUCTION is false.
1947
1948    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1949    reductions:
1950
1951      a1 = phi < a0, a2 >
1952      inner loop (def of a3)
1953      a2 = phi < a3 >
1954
1955    If MODIFY is true it tries also to rework the code in-place to enable
1956    detection of more reduction patterns.  For the time being we rewrite
1957    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1958 */
1959
1960 static gimple
1961 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1962                             bool check_reduction, bool *double_reduc,
1963                             bool modify)
1964 {
1965   struct loop *loop = (gimple_bb (phi))->loop_father;
1966   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1967   edge latch_e = loop_latch_edge (loop);
1968   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1969   gimple def_stmt, def1 = NULL, def2 = NULL;
1970   enum tree_code orig_code, code;
1971   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1972   tree type;
1973   int nloop_uses;
1974   tree name;
1975   imm_use_iterator imm_iter;
1976   use_operand_p use_p;
1977   bool phi_def;
1978
1979   *double_reduc = false;
1980
1981   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1982      otherwise, we assume outer loop vectorization.  */
1983   gcc_assert ((check_reduction && loop == vect_loop)
1984               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1985
1986   name = PHI_RESULT (phi);
1987   nloop_uses = 0;
1988   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1989     {
1990       gimple use_stmt = USE_STMT (use_p);
1991       if (is_gimple_debug (use_stmt))
1992         continue;
1993
1994       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1995         {
1996           if (vect_print_dump_info (REPORT_DETAILS))
1997             fprintf (vect_dump, "intermediate value used outside loop.");
1998
1999           return NULL;
2000         }
2001
2002       if (vinfo_for_stmt (use_stmt)
2003           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2004         nloop_uses++;
2005       if (nloop_uses > 1)
2006         {
2007           if (vect_print_dump_info (REPORT_DETAILS))
2008             fprintf (vect_dump, "reduction used in loop.");
2009           return NULL;
2010         }
2011     }
2012
2013   if (TREE_CODE (loop_arg) != SSA_NAME)
2014     {
2015       if (vect_print_dump_info (REPORT_DETAILS))
2016         {
2017           fprintf (vect_dump, "reduction: not ssa_name: ");
2018           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2019         }
2020       return NULL;
2021     }
2022
2023   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2024   if (!def_stmt)
2025     {
2026       if (vect_print_dump_info (REPORT_DETAILS))
2027         fprintf (vect_dump, "reduction: no def_stmt.");
2028       return NULL;
2029     }
2030
2031   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2032     {
2033       if (vect_print_dump_info (REPORT_DETAILS))
2034         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2035       return NULL;
2036     }
2037
2038   if (is_gimple_assign (def_stmt))
2039     {
2040       name = gimple_assign_lhs (def_stmt);
2041       phi_def = false;
2042     }
2043   else
2044     {
2045       name = PHI_RESULT (def_stmt);
2046       phi_def = true;
2047     }
2048
2049   nloop_uses = 0;
2050   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2051     {
2052       gimple use_stmt = USE_STMT (use_p);
2053       if (is_gimple_debug (use_stmt))
2054         continue;
2055       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2056           && vinfo_for_stmt (use_stmt)
2057           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2058         nloop_uses++;
2059       if (nloop_uses > 1)
2060         {
2061           if (vect_print_dump_info (REPORT_DETAILS))
2062             fprintf (vect_dump, "reduction used in loop.");
2063           return NULL;
2064         }
2065     }
2066
2067   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2068      defined in the inner loop.  */
2069   if (phi_def)
2070     {
2071       op1 = PHI_ARG_DEF (def_stmt, 0);
2072
2073       if (gimple_phi_num_args (def_stmt) != 1
2074           || TREE_CODE (op1) != SSA_NAME)
2075         {
2076           if (vect_print_dump_info (REPORT_DETAILS))
2077             fprintf (vect_dump, "unsupported phi node definition.");
2078
2079           return NULL;
2080         }
2081
2082       def1 = SSA_NAME_DEF_STMT (op1);
2083       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2084           && loop->inner
2085           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2086           && is_gimple_assign (def1))
2087         {
2088           if (vect_print_dump_info (REPORT_DETAILS))
2089             report_vect_op (def_stmt, "detected double reduction: ");
2090
2091           *double_reduc = true;
2092           return def_stmt;
2093         }
2094
2095       return NULL;
2096     }
2097
2098   code = orig_code = gimple_assign_rhs_code (def_stmt);
2099
2100   /* We can handle "res -= x[i]", which is non-associative by
2101      simply rewriting this into "res += -x[i]".  Avoid changing
2102      gimple instruction for the first simple tests and only do this
2103      if we're allowed to change code at all.  */
2104   if (code == MINUS_EXPR
2105       && modify
2106       && (op1 = gimple_assign_rhs1 (def_stmt))
2107       && TREE_CODE (op1) == SSA_NAME
2108       && SSA_NAME_DEF_STMT (op1) == phi)
2109     code = PLUS_EXPR;
2110
2111   if (check_reduction
2112       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2113     {
2114       if (vect_print_dump_info (REPORT_DETAILS))
2115         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2116       return NULL;
2117     }
2118
2119   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2120     {
2121       if (code != COND_EXPR)
2122         {
2123           if (vect_print_dump_info (REPORT_DETAILS))
2124             report_vect_op (def_stmt, "reduction: not binary operation: ");
2125
2126           return NULL;
2127         }
2128
2129       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2130       if (COMPARISON_CLASS_P (op3))
2131         {
2132           op4 = TREE_OPERAND (op3, 1);
2133           op3 = TREE_OPERAND (op3, 0);
2134         }
2135
2136       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
2137       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
2138
2139       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2140         {
2141           if (vect_print_dump_info (REPORT_DETAILS))
2142             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2143
2144           return NULL;
2145         }
2146     }
2147   else
2148     {
2149       op1 = gimple_assign_rhs1 (def_stmt);
2150       op2 = gimple_assign_rhs2 (def_stmt);
2151
2152       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2153         {
2154           if (vect_print_dump_info (REPORT_DETAILS))
2155             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2156
2157           return NULL;
2158         }
2159    }
2160
2161   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2162   if ((TREE_CODE (op1) == SSA_NAME
2163        && !types_compatible_p (type,TREE_TYPE (op1)))
2164       || (TREE_CODE (op2) == SSA_NAME
2165           && !types_compatible_p (type, TREE_TYPE (op2)))
2166       || (op3 && TREE_CODE (op3) == SSA_NAME
2167           && !types_compatible_p (type, TREE_TYPE (op3)))
2168       || (op4 && TREE_CODE (op4) == SSA_NAME
2169           && !types_compatible_p (type, TREE_TYPE (op4))))
2170     {
2171       if (vect_print_dump_info (REPORT_DETAILS))
2172         {
2173           fprintf (vect_dump, "reduction: multiple types: operation type: ");
2174           print_generic_expr (vect_dump, type, TDF_SLIM);
2175           fprintf (vect_dump, ", operands types: ");
2176           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2177           fprintf (vect_dump, ",");
2178           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2179           if (op3)
2180             {
2181               fprintf (vect_dump, ",");
2182               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2183             }
2184
2185           if (op4)
2186             {
2187               fprintf (vect_dump, ",");
2188               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2189             }
2190         }
2191
2192       return NULL;
2193     }
2194
2195   /* Check that it's ok to change the order of the computation.
2196      Generally, when vectorizing a reduction we change the order of the
2197      computation.  This may change the behavior of the program in some
2198      cases, so we need to check that this is ok.  One exception is when
2199      vectorizing an outer-loop: the inner-loop is executed sequentially,
2200      and therefore vectorizing reductions in the inner-loop during
2201      outer-loop vectorization is safe.  */
2202
2203   /* CHECKME: check for !flag_finite_math_only too?  */
2204   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2205       && check_reduction)
2206     {
2207       /* Changing the order of operations changes the semantics.  */
2208       if (vect_print_dump_info (REPORT_DETAILS))
2209         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2210       return NULL;
2211     }
2212   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2213            && check_reduction)
2214     {
2215       /* Changing the order of operations changes the semantics.  */
2216       if (vect_print_dump_info (REPORT_DETAILS))
2217         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2218       return NULL;
2219     }
2220   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2221     {
2222       /* Changing the order of operations changes the semantics.  */
2223       if (vect_print_dump_info (REPORT_DETAILS))
2224         report_vect_op (def_stmt,
2225                         "reduction: unsafe fixed-point math optimization: ");
2226       return NULL;
2227     }
2228
2229   /* If we detected "res -= x[i]" earlier, rewrite it into
2230      "res += -x[i]" now.  If this turns out to be useless reassoc
2231      will clean it up again.  */
2232   if (orig_code == MINUS_EXPR)
2233     {
2234       tree rhs = gimple_assign_rhs2 (def_stmt);
2235       tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2236       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2237                                                          rhs, NULL);
2238       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2239       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2240                                                           loop_info, NULL));
2241       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2242       gimple_assign_set_rhs2 (def_stmt, negrhs);
2243       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2244       update_stmt (def_stmt);
2245     }
2246
2247   /* Reduction is safe. We're dealing with one of the following:
2248      1) integer arithmetic and no trapv
2249      2) floating point arithmetic, and special flags permit this optimization
2250      3) nested cycle (i.e., outer loop vectorization).  */
2251   if (TREE_CODE (op1) == SSA_NAME)
2252     def1 = SSA_NAME_DEF_STMT (op1);
2253
2254   if (TREE_CODE (op2) == SSA_NAME)
2255     def2 = SSA_NAME_DEF_STMT (op2);
2256
2257   if (code != COND_EXPR
2258       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
2259     {
2260       if (vect_print_dump_info (REPORT_DETAILS))
2261         report_vect_op (def_stmt, "reduction: no defs for operands: ");
2262       return NULL;
2263     }
2264
2265   /* Check that one def is the reduction def, defined by PHI,
2266      the other def is either defined in the loop ("vect_internal_def"),
2267      or it's an induction (defined by a loop-header phi-node).  */
2268
2269   if (def2 && def2 == phi
2270       && (code == COND_EXPR
2271           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2272               && (is_gimple_assign (def1)
2273                   || is_gimple_call (def1)
2274                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2275                       == vect_induction_def
2276                   || (gimple_code (def1) == GIMPLE_PHI
2277                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2278                           == vect_internal_def
2279                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2280     {
2281       if (vect_print_dump_info (REPORT_DETAILS))
2282         report_vect_op (def_stmt, "detected reduction: ");
2283       return def_stmt;
2284     }
2285
2286   if (def1 && def1 == phi
2287       && (code == COND_EXPR
2288           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2289               && (is_gimple_assign (def2)
2290                   || is_gimple_call (def2)
2291                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2292                       == vect_induction_def
2293                   || (gimple_code (def2) == GIMPLE_PHI
2294                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2295                           == vect_internal_def
2296                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2297     {
2298       if (check_reduction)
2299         {
2300           /* Swap operands (just for simplicity - so that the rest of the code
2301              can assume that the reduction variable is always the last (second)
2302              argument).  */
2303           if (vect_print_dump_info (REPORT_DETAILS))
2304             report_vect_op (def_stmt,
2305                             "detected reduction: need to swap operands: ");
2306
2307           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2308                               gimple_assign_rhs2_ptr (def_stmt));
2309         }
2310       else
2311         {
2312           if (vect_print_dump_info (REPORT_DETAILS))
2313             report_vect_op (def_stmt, "detected reduction: ");
2314         }
2315
2316       return def_stmt;
2317     }
2318
2319   /* Try to find SLP reduction chain.  */
2320   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2321     {
2322       if (vect_print_dump_info (REPORT_DETAILS))
2323         report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2324
2325       return def_stmt;
2326     }
2327
2328   if (vect_print_dump_info (REPORT_DETAILS))
2329     report_vect_op (def_stmt, "reduction: unknown pattern: ");
2330
2331   return NULL;
2332 }
2333
2334 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2335    in-place.  Arguments as there.  */
2336
2337 static gimple
2338 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2339                           bool check_reduction, bool *double_reduc)
2340 {
2341   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2342                                      double_reduc, false);
2343 }
2344
2345 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2346    in-place if it enables detection of more reductions.  Arguments
2347    as there.  */
2348
2349 gimple
2350 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2351                           bool check_reduction, bool *double_reduc)
2352 {
2353   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2354                                      double_reduc, true);
2355 }
2356
2357 /* Calculate the cost of one scalar iteration of the loop.  */
2358 int
2359 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2360 {
2361   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2362   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2363   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2364   int innerloop_iters, i, stmt_cost;
2365
2366   /* Count statements in scalar loop.  Using this as scalar cost for a single
2367      iteration for now.
2368
2369      TODO: Add outer loop support.
2370
2371      TODO: Consider assigning different costs to different scalar
2372      statements.  */
2373
2374   /* FORNOW.  */
2375   innerloop_iters = 1;
2376   if (loop->inner)
2377     innerloop_iters = 50; /* FIXME */
2378
2379   for (i = 0; i < nbbs; i++)
2380     {
2381       gimple_stmt_iterator si;
2382       basic_block bb = bbs[i];
2383
2384       if (bb->loop_father == loop->inner)
2385         factor = innerloop_iters;
2386       else
2387         factor = 1;
2388
2389       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2390         {
2391           gimple stmt = gsi_stmt (si);
2392           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2393
2394           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2395             continue;
2396
2397           /* Skip stmts that are not vectorized inside the loop.  */
2398           if (stmt_info
2399               && !STMT_VINFO_RELEVANT_P (stmt_info)
2400               && (!STMT_VINFO_LIVE_P (stmt_info)
2401                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2402             continue;
2403
2404           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2405             {
2406               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2407                stmt_cost = vect_get_cost (scalar_load);
2408              else
2409                stmt_cost = vect_get_cost (scalar_store);
2410             }
2411           else
2412             stmt_cost = vect_get_cost (scalar_stmt);
2413
2414           scalar_single_iter_cost += stmt_cost * factor;
2415         }
2416     }
2417   return scalar_single_iter_cost;
2418 }
2419
2420 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2421 int
2422 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2423                              int *peel_iters_epilogue,
2424                              int scalar_single_iter_cost)
2425 {
2426   int peel_guard_costs = 0;
2427   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2428
2429   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2430     {
2431       *peel_iters_epilogue = vf/2;
2432       if (vect_print_dump_info (REPORT_COST))
2433         fprintf (vect_dump, "cost model: "
2434                             "epilogue peel iters set to vf/2 because "
2435                             "loop iterations are unknown .");
2436
2437       /* If peeled iterations are known but number of scalar loop
2438          iterations are unknown, count a taken branch per peeled loop.  */
2439       peel_guard_costs =  2 * vect_get_cost (cond_branch_taken);
2440     }
2441   else
2442     {
2443       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2444       peel_iters_prologue = niters < peel_iters_prologue ?
2445                             niters : peel_iters_prologue;
2446       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2447       /* If we need to peel for gaps, but no peeling is required, we have to
2448          peel VF iterations.  */
2449       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2450         *peel_iters_epilogue = vf;
2451     }
2452
2453    return (peel_iters_prologue * scalar_single_iter_cost)
2454             + (*peel_iters_epilogue * scalar_single_iter_cost)
2455            + peel_guard_costs;
2456 }
2457
2458 /* Function vect_estimate_min_profitable_iters
2459
2460    Return the number of iterations required for the vector version of the
2461    loop to be profitable relative to the cost of the scalar version of the
2462    loop.
2463
2464    TODO: Take profile info into account before making vectorization
2465    decisions, if available.  */
2466
2467 int
2468 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2469 {
2470   int i;
2471   int min_profitable_iters;
2472   int peel_iters_prologue;
2473   int peel_iters_epilogue;
2474   int vec_inside_cost = 0;
2475   int vec_outside_cost = 0;
2476   int scalar_single_iter_cost = 0;
2477   int scalar_outside_cost = 0;
2478   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2479   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2480   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2481   int nbbs = loop->num_nodes;
2482   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2483   int peel_guard_costs = 0;
2484   int innerloop_iters = 0, factor;
2485   VEC (slp_instance, heap) *slp_instances;
2486   slp_instance instance;
2487
2488   /* Cost model disabled.  */
2489   if (!flag_vect_cost_model)
2490     {
2491       if (vect_print_dump_info (REPORT_COST))
2492         fprintf (vect_dump, "cost model disabled.");
2493       return 0;
2494     }
2495
2496   /* Requires loop versioning tests to handle misalignment.  */
2497   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2498     {
2499       /*  FIXME: Make cost depend on complexity of individual check.  */
2500       vec_outside_cost +=
2501         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2502       if (vect_print_dump_info (REPORT_COST))
2503         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2504                  "versioning to treat misalignment.\n");
2505     }
2506
2507   /* Requires loop versioning with alias checks.  */
2508   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2509     {
2510       /*  FIXME: Make cost depend on complexity of individual check.  */
2511       vec_outside_cost +=
2512         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2513       if (vect_print_dump_info (REPORT_COST))
2514         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2515                  "versioning aliasing.\n");
2516     }
2517
2518   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2519       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2520     vec_outside_cost += vect_get_cost (cond_branch_taken);
2521
2522   /* Count statements in scalar loop.  Using this as scalar cost for a single
2523      iteration for now.
2524
2525      TODO: Add outer loop support.
2526
2527      TODO: Consider assigning different costs to different scalar
2528      statements.  */
2529
2530   /* FORNOW.  */
2531   if (loop->inner)
2532     innerloop_iters = 50; /* FIXME */
2533
2534   for (i = 0; i < nbbs; i++)
2535     {
2536       gimple_stmt_iterator si;
2537       basic_block bb = bbs[i];
2538
2539       if (bb->loop_father == loop->inner)
2540         factor = innerloop_iters;
2541       else
2542         factor = 1;
2543
2544       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2545         {
2546           gimple stmt = gsi_stmt (si);
2547           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2548           /* Skip stmts that are not vectorized inside the loop.  */
2549           if (!STMT_VINFO_RELEVANT_P (stmt_info)
2550               && (!STMT_VINFO_LIVE_P (stmt_info)
2551                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2552             continue;
2553           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2554           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2555              some of the "outside" costs are generated inside the outer-loop.  */
2556           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2557         }
2558     }
2559
2560   scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2561
2562   /* Add additional cost for the peeled instructions in prologue and epilogue
2563      loop.
2564
2565      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2566      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2567
2568      TODO: Build an expression that represents peel_iters for prologue and
2569      epilogue to be used in a run-time test.  */
2570
2571   if (npeel  < 0)
2572     {
2573       peel_iters_prologue = vf/2;
2574       if (vect_print_dump_info (REPORT_COST))
2575         fprintf (vect_dump, "cost model: "
2576                  "prologue peel iters set to vf/2.");
2577
2578       /* If peeling for alignment is unknown, loop bound of main loop becomes
2579          unknown.  */
2580       peel_iters_epilogue = vf/2;
2581       if (vect_print_dump_info (REPORT_COST))
2582         fprintf (vect_dump, "cost model: "
2583                  "epilogue peel iters set to vf/2 because "
2584                  "peeling for alignment is unknown .");
2585
2586       /* If peeled iterations are unknown, count a taken branch and a not taken
2587          branch per peeled loop. Even if scalar loop iterations are known,
2588          vector iterations are not known since peeled prologue iterations are
2589          not known. Hence guards remain the same.  */
2590       peel_guard_costs +=  2 * (vect_get_cost (cond_branch_taken)
2591                                 + vect_get_cost (cond_branch_not_taken));
2592       vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2593                            + (peel_iters_epilogue * scalar_single_iter_cost)
2594                            + peel_guard_costs;
2595     }
2596   else
2597     {
2598       peel_iters_prologue = npeel;
2599       vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2600                                     peel_iters_prologue, &peel_iters_epilogue,
2601                                     scalar_single_iter_cost);
2602     }
2603
2604   /* FORNOW: The scalar outside cost is incremented in one of the
2605      following ways:
2606
2607      1. The vectorizer checks for alignment and aliasing and generates
2608      a condition that allows dynamic vectorization.  A cost model
2609      check is ANDED with the versioning condition.  Hence scalar code
2610      path now has the added cost of the versioning check.
2611
2612        if (cost > th & versioning_check)
2613          jmp to vector code
2614
2615      Hence run-time scalar is incremented by not-taken branch cost.
2616
2617      2. The vectorizer then checks if a prologue is required.  If the
2618      cost model check was not done before during versioning, it has to
2619      be done before the prologue check.
2620
2621        if (cost <= th)
2622          prologue = scalar_iters
2623        if (prologue == 0)
2624          jmp to vector code
2625        else
2626          execute prologue
2627        if (prologue == num_iters)
2628          go to exit
2629
2630      Hence the run-time scalar cost is incremented by a taken branch,
2631      plus a not-taken branch, plus a taken branch cost.
2632
2633      3. The vectorizer then checks if an epilogue is required.  If the
2634      cost model check was not done before during prologue check, it
2635      has to be done with the epilogue check.
2636
2637        if (prologue == 0)
2638          jmp to vector code
2639        else
2640          execute prologue
2641        if (prologue == num_iters)
2642          go to exit
2643        vector code:
2644          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2645            jmp to epilogue
2646
2647      Hence the run-time scalar cost should be incremented by 2 taken
2648      branches.
2649
2650      TODO: The back end may reorder the BBS's differently and reverse
2651      conditions/branch directions.  Change the estimates below to
2652      something more reasonable.  */
2653
2654   /* If the number of iterations is known and we do not do versioning, we can
2655      decide whether to vectorize at compile time.  Hence the scalar version
2656      do not carry cost model guard costs.  */
2657   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2658       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2659       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2660     {
2661       /* Cost model check occurs at versioning.  */
2662       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2663           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2664         scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2665       else
2666         {
2667           /* Cost model check occurs at prologue generation.  */
2668           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2669             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2670                                    + vect_get_cost (cond_branch_not_taken);
2671           /* Cost model check occurs at epilogue generation.  */
2672           else
2673             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2674         }
2675     }
2676
2677   /* Add SLP costs.  */
2678   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2679   FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2680     {
2681       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2682       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2683     }
2684
2685   /* Calculate number of iterations required to make the vector version
2686      profitable, relative to the loop bodies only.  The following condition
2687      must hold true:
2688      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2689      where
2690      SIC = scalar iteration cost, VIC = vector iteration cost,
2691      VOC = vector outside cost, VF = vectorization factor,
2692      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2693      SOC = scalar outside cost for run time cost model check.  */
2694
2695   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2696     {
2697       if (vec_outside_cost <= 0)
2698         min_profitable_iters = 1;
2699       else
2700         {
2701           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2702                                   - vec_inside_cost * peel_iters_prologue
2703                                   - vec_inside_cost * peel_iters_epilogue)
2704                                  / ((scalar_single_iter_cost * vf)
2705                                     - vec_inside_cost);
2706
2707           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2708               <= ((vec_inside_cost * min_profitable_iters)
2709                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2710             min_profitable_iters++;
2711         }
2712     }
2713   /* vector version will never be profitable.  */
2714   else
2715     {
2716       if (vect_print_dump_info (REPORT_COST))
2717         fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2718                  "divided by the scalar iteration cost = %d "
2719                  "is greater or equal to the vectorization factor = %d.",
2720                  vec_inside_cost, scalar_single_iter_cost, vf);
2721       return -1;
2722     }
2723
2724   if (vect_print_dump_info (REPORT_COST))
2725     {
2726       fprintf (vect_dump, "Cost model analysis: \n");
2727       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2728                vec_inside_cost);
2729       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2730                vec_outside_cost);
2731       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2732                scalar_single_iter_cost);
2733       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2734       fprintf (vect_dump, "  prologue iterations: %d\n",
2735                peel_iters_prologue);
2736       fprintf (vect_dump, "  epilogue iterations: %d\n",
2737                peel_iters_epilogue);
2738       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2739                min_profitable_iters);
2740     }
2741
2742   min_profitable_iters =
2743         min_profitable_iters < vf ? vf : min_profitable_iters;
2744
2745   /* Because the condition we create is:
2746      if (niters <= min_profitable_iters)
2747        then skip the vectorized loop.  */
2748   min_profitable_iters--;
2749
2750   if (vect_print_dump_info (REPORT_COST))
2751     fprintf (vect_dump, "  Profitability threshold = %d\n",
2752              min_profitable_iters);
2753
2754   return min_profitable_iters;
2755 }
2756
2757
2758 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2759    functions. Design better to avoid maintenance issues.  */
2760
2761 /* Function vect_model_reduction_cost.
2762
2763    Models cost for a reduction operation, including the vector ops
2764    generated within the strip-mine loop, the initial definition before
2765    the loop, and the epilogue code that must be generated.  */
2766
2767 static bool
2768 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2769                            int ncopies)
2770 {
2771   int outer_cost = 0;
2772   enum tree_code code;
2773   optab optab;
2774   tree vectype;
2775   gimple stmt, orig_stmt;
2776   tree reduction_op;
2777   enum machine_mode mode;
2778   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2779   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2780
2781
2782   /* Cost of reduction op inside loop.  */
2783   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2784     += ncopies * vect_get_cost (vector_stmt);
2785
2786   stmt = STMT_VINFO_STMT (stmt_info);
2787
2788   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2789     {
2790     case GIMPLE_SINGLE_RHS:
2791       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2792       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2793       break;
2794     case GIMPLE_UNARY_RHS:
2795       reduction_op = gimple_assign_rhs1 (stmt);
2796       break;
2797     case GIMPLE_BINARY_RHS:
2798       reduction_op = gimple_assign_rhs2 (stmt);
2799       break;
2800     case GIMPLE_TERNARY_RHS:
2801       reduction_op = gimple_assign_rhs3 (stmt);
2802       break;
2803     default:
2804       gcc_unreachable ();
2805     }
2806
2807   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2808   if (!vectype)
2809     {
2810       if (vect_print_dump_info (REPORT_COST))
2811         {
2812           fprintf (vect_dump, "unsupported data-type ");
2813           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2814         }
2815       return false;
2816    }
2817
2818   mode = TYPE_MODE (vectype);
2819   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2820
2821   if (!orig_stmt)
2822     orig_stmt = STMT_VINFO_STMT (stmt_info);
2823
2824   code = gimple_assign_rhs_code (orig_stmt);
2825
2826   /* Add in cost for initial definition.  */
2827   outer_cost += vect_get_cost (scalar_to_vec);
2828
2829   /* Determine cost of epilogue code.
2830
2831      We have a reduction operator that will reduce the vector in one statement.
2832      Also requires scalar extract.  */
2833
2834   if (!nested_in_vect_loop_p (loop, orig_stmt))
2835     {
2836       if (reduc_code != ERROR_MARK)
2837         outer_cost += vect_get_cost (vector_stmt)
2838                       + vect_get_cost (vec_to_scalar);
2839       else
2840         {
2841           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2842           tree bitsize =
2843             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2844           int element_bitsize = tree_low_cst (bitsize, 1);
2845           int nelements = vec_size_in_bits / element_bitsize;
2846
2847           optab = optab_for_tree_code (code, vectype, optab_default);
2848
2849           /* We have a whole vector shift available.  */
2850           if (VECTOR_MODE_P (mode)
2851               && optab_handler (optab, mode) != CODE_FOR_nothing
2852               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2853             /* Final reduction via vector shifts and the reduction operator. Also
2854                requires scalar extract.  */
2855             outer_cost += ((exact_log2(nelements) * 2)
2856               * vect_get_cost (vector_stmt)
2857               + vect_get_cost (vec_to_scalar));
2858           else
2859             /* Use extracts and reduction op for final reduction.  For N elements,
2860                we have N extracts and N-1 reduction ops.  */
2861             outer_cost += ((nelements + nelements - 1)
2862               * vect_get_cost (vector_stmt));
2863         }
2864     }
2865
2866   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2867
2868   if (vect_print_dump_info (REPORT_COST))
2869     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2870              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2871              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2872
2873   return true;
2874 }
2875
2876
2877 /* Function vect_model_induction_cost.
2878
2879    Models cost for induction operations.  */
2880
2881 static void
2882 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2883 {
2884   /* loop cost for vec_loop.  */
2885   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2886     = ncopies * vect_get_cost (vector_stmt);
2887   /* prologue cost for vec_init and vec_step.  */
2888   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2889     = 2 * vect_get_cost (scalar_to_vec);
2890
2891   if (vect_print_dump_info (REPORT_COST))
2892     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2893              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2894              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2895 }
2896
2897
2898 /* Function get_initial_def_for_induction
2899
2900    Input:
2901    STMT - a stmt that performs an induction operation in the loop.
2902    IV_PHI - the initial value of the induction variable
2903
2904    Output:
2905    Return a vector variable, initialized with the first VF values of
2906    the induction variable.  E.g., for an iv with IV_PHI='X' and
2907    evolution S, for a vector of 4 units, we want to return:
2908    [X, X + S, X + 2*S, X + 3*S].  */
2909
2910 static tree
2911 get_initial_def_for_induction (gimple iv_phi)
2912 {
2913   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2914   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2915   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2916   tree scalar_type;
2917   tree vectype;
2918   int nunits;
2919   edge pe = loop_preheader_edge (loop);
2920   struct loop *iv_loop;
2921   basic_block new_bb;
2922   tree vec, vec_init, vec_step, t;
2923   tree access_fn;
2924   tree new_var;
2925   tree new_name;
2926   gimple init_stmt, induction_phi, new_stmt;
2927   tree induc_def, vec_def, vec_dest;
2928   tree init_expr, step_expr;
2929   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2930   int i;
2931   bool ok;
2932   int ncopies;
2933   tree expr;
2934   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2935   bool nested_in_vect_loop = false;
2936   gimple_seq stmts = NULL;
2937   imm_use_iterator imm_iter;
2938   use_operand_p use_p;
2939   gimple exit_phi;
2940   edge latch_e;
2941   tree loop_arg;
2942   gimple_stmt_iterator si;
2943   basic_block bb = gimple_bb (iv_phi);
2944   tree stepvectype;
2945   tree resvectype;
2946
2947   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2948   if (nested_in_vect_loop_p (loop, iv_phi))
2949     {
2950       nested_in_vect_loop = true;
2951       iv_loop = loop->inner;
2952     }
2953   else
2954     iv_loop = loop;
2955   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2956
2957   latch_e = loop_latch_edge (iv_loop);
2958   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2959
2960   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2961   gcc_assert (access_fn);
2962   STRIP_NOPS (access_fn);
2963   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2964                                     &init_expr, &step_expr);
2965   gcc_assert (ok);
2966   pe = loop_preheader_edge (iv_loop);
2967
2968   scalar_type = TREE_TYPE (init_expr);
2969   vectype = get_vectype_for_scalar_type (scalar_type);
2970   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2971   gcc_assert (vectype);
2972   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2973   ncopies = vf / nunits;
2974
2975   gcc_assert (phi_info);
2976   gcc_assert (ncopies >= 1);
2977
2978   /* Find the first insertion point in the BB.  */
2979   si = gsi_after_labels (bb);
2980
2981   /* Create the vector that holds the initial_value of the induction.  */
2982   if (nested_in_vect_loop)
2983     {
2984       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2985          been created during vectorization of previous stmts.  We obtain it
2986          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
2987       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2988                                            loop_preheader_edge (iv_loop));
2989       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2990     }
2991   else
2992     {
2993       /* iv_loop is the loop to be vectorized. Create:
2994          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2995       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2996       add_referenced_var (new_var);
2997
2998       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2999       if (stmts)
3000         {
3001           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3002           gcc_assert (!new_bb);
3003         }
3004
3005       t = NULL_TREE;
3006       t = tree_cons (NULL_TREE, new_name, t);
3007       for (i = 1; i < nunits; i++)
3008         {
3009           /* Create: new_name_i = new_name + step_expr  */
3010           enum tree_code code = POINTER_TYPE_P (scalar_type)
3011                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3012           init_stmt = gimple_build_assign_with_ops (code, new_var,
3013                                                     new_name, step_expr);
3014           new_name = make_ssa_name (new_var, init_stmt);
3015           gimple_assign_set_lhs (init_stmt, new_name);
3016
3017           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3018           gcc_assert (!new_bb);
3019
3020           if (vect_print_dump_info (REPORT_DETAILS))
3021             {
3022               fprintf (vect_dump, "created new init_stmt: ");
3023               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
3024             }
3025           t = tree_cons (NULL_TREE, new_name, t);
3026         }
3027       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3028       vec = build_constructor_from_list (vectype, nreverse (t));
3029       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3030     }
3031
3032
3033   /* Create the vector that holds the step of the induction.  */
3034   if (nested_in_vect_loop)
3035     /* iv_loop is nested in the loop to be vectorized. Generate:
3036        vec_step = [S, S, S, S]  */
3037     new_name = step_expr;
3038   else
3039     {
3040       /* iv_loop is the loop to be vectorized. Generate:
3041           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3042       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3043       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3044                               expr, step_expr);
3045     }
3046
3047   t = unshare_expr (new_name);
3048   gcc_assert (CONSTANT_CLASS_P (new_name));
3049   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3050   gcc_assert (stepvectype);
3051   vec = build_vector_from_val (stepvectype, t);
3052   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3053
3054
3055   /* Create the following def-use cycle:
3056      loop prolog:
3057          vec_init = ...
3058          vec_step = ...
3059      loop:
3060          vec_iv = PHI <vec_init, vec_loop>
3061          ...
3062          STMT
3063          ...
3064          vec_loop = vec_iv + vec_step;  */
3065
3066   /* Create the induction-phi that defines the induction-operand.  */
3067   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3068   add_referenced_var (vec_dest);
3069   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3070   set_vinfo_for_stmt (induction_phi,
3071                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3072   induc_def = PHI_RESULT (induction_phi);
3073
3074   /* Create the iv update inside the loop  */
3075   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3076                                            induc_def, vec_step);
3077   vec_def = make_ssa_name (vec_dest, new_stmt);
3078   gimple_assign_set_lhs (new_stmt, vec_def);
3079   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3080   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3081                                                    NULL));
3082
3083   /* Set the arguments of the phi node:  */
3084   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3085   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3086                UNKNOWN_LOCATION);
3087
3088
3089   /* In case that vectorization factor (VF) is bigger than the number
3090      of elements that we can fit in a vectype (nunits), we have to generate
3091      more than one vector stmt - i.e - we need to "unroll" the
3092      vector stmt by a factor VF/nunits.  For more details see documentation
3093      in vectorizable_operation.  */
3094
3095   if (ncopies > 1)
3096     {
3097       stmt_vec_info prev_stmt_vinfo;
3098       /* FORNOW. This restriction should be relaxed.  */
3099       gcc_assert (!nested_in_vect_loop);
3100
3101       /* Create the vector that holds the step of the induction.  */
3102       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3103       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3104                               expr, step_expr);
3105       t = unshare_expr (new_name);
3106       gcc_assert (CONSTANT_CLASS_P (new_name));
3107       vec = build_vector_from_val (stepvectype, t);
3108       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3109
3110       vec_def = induc_def;
3111       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3112       for (i = 1; i < ncopies; i++)
3113         {
3114           /* vec_i = vec_prev + vec_step  */
3115           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3116                                                    vec_def, vec_step);
3117           vec_def = make_ssa_name (vec_dest, new_stmt);
3118           gimple_assign_set_lhs (new_stmt, vec_def);
3119
3120           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3121           if (!useless_type_conversion_p (resvectype, vectype))
3122             {
3123               new_stmt = gimple_build_assign_with_ops
3124                   (VIEW_CONVERT_EXPR,
3125                    vect_get_new_vect_var (resvectype, vect_simple_var,
3126                                           "vec_iv_"),
3127                    build1 (VIEW_CONVERT_EXPR, resvectype,
3128                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3129               gimple_assign_set_lhs (new_stmt,
3130                                      make_ssa_name
3131                                        (gimple_assign_lhs (new_stmt), new_stmt));
3132               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3133             }
3134           set_vinfo_for_stmt (new_stmt,
3135                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3136           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3137           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3138         }
3139     }
3140
3141   if (nested_in_vect_loop)
3142     {
3143       /* Find the loop-closed exit-phi of the induction, and record
3144          the final vector of induction results:  */
3145       exit_phi = NULL;
3146       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3147         {
3148           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3149             {
3150               exit_phi = USE_STMT (use_p);
3151               break;
3152             }
3153         }
3154       if (exit_phi)
3155         {
3156           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3157           /* FORNOW. Currently not supporting the case that an inner-loop induction
3158              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3159           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3160                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3161
3162           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3163           if (vect_print_dump_info (REPORT_DETAILS))
3164             {
3165               fprintf (vect_dump, "vector of inductions after inner-loop:");
3166               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3167             }
3168         }
3169     }
3170
3171
3172   if (vect_print_dump_info (REPORT_DETAILS))
3173     {
3174       fprintf (vect_dump, "transform induction: created def-use cycle: ");
3175       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3176       fprintf (vect_dump, "\n");
3177       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3178     }
3179
3180   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3181   if (!useless_type_conversion_p (resvectype, vectype))
3182     {
3183       new_stmt = gimple_build_assign_with_ops
3184          (VIEW_CONVERT_EXPR,
3185           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3186           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3187       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3188       gimple_assign_set_lhs (new_stmt, induc_def);
3189       si = gsi_start_bb (bb);
3190       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3191       set_vinfo_for_stmt (new_stmt,
3192                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3193       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3194         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3195     }
3196
3197   return induc_def;
3198 }
3199
3200
3201 /* Function get_initial_def_for_reduction
3202
3203    Input:
3204    STMT - a stmt that performs a reduction operation in the loop.
3205    INIT_VAL - the initial value of the reduction variable
3206
3207    Output:
3208    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3209         of the reduction (used for adjusting the epilog - see below).
3210    Return a vector variable, initialized according to the operation that STMT
3211         performs. This vector will be used as the initial value of the
3212         vector of partial results.
3213
3214    Option1 (adjust in epilog): Initialize the vector as follows:
3215      add/bit or/xor:    [0,0,...,0,0]
3216      mult/bit and:      [1,1,...,1,1]
3217      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3218    and when necessary (e.g. add/mult case) let the caller know
3219    that it needs to adjust the result by init_val.
3220
3221    Option2: Initialize the vector as follows:
3222      add/bit or/xor:    [init_val,0,0,...,0]
3223      mult/bit and:      [init_val,1,1,...,1]
3224      min/max/cond_expr: [init_val,init_val,...,init_val]
3225    and no adjustments are needed.
3226
3227    For example, for the following code:
3228
3229    s = init_val;
3230    for (i=0;i<n;i++)
3231      s = s + a[i];
3232
3233    STMT is 's = s + a[i]', and the reduction variable is 's'.
3234    For a vector of 4 units, we want to return either [0,0,0,init_val],
3235    or [0,0,0,0] and let the caller know that it needs to adjust
3236    the result at the end by 'init_val'.
3237
3238    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3239    initialization vector is simpler (same element in all entries), if
3240    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3241
3242    A cost model should help decide between these two schemes.  */
3243
3244 tree
3245 get_initial_def_for_reduction (gimple stmt, tree init_val,
3246                                tree *adjustment_def)
3247 {
3248   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3249   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3250   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3251   tree scalar_type = TREE_TYPE (init_val);
3252   tree vectype = get_vectype_for_scalar_type (scalar_type);
3253   int nunits;
3254   enum tree_code code = gimple_assign_rhs_code (stmt);
3255   tree def_for_init;
3256   tree init_def;
3257   tree t = NULL_TREE;
3258   int i;
3259   bool nested_in_vect_loop = false;
3260   tree init_value;
3261   REAL_VALUE_TYPE real_init_val = dconst0;
3262   int int_init_val = 0;
3263   gimple def_stmt = NULL;
3264
3265   gcc_assert (vectype);
3266   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3267
3268   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3269               || SCALAR_FLOAT_TYPE_P (scalar_type));
3270
3271   if (nested_in_vect_loop_p (loop, stmt))
3272     nested_in_vect_loop = true;
3273   else
3274     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3275
3276   /* In case of double reduction we only create a vector variable to be put
3277      in the reduction phi node.  The actual statement creation is done in
3278      vect_create_epilog_for_reduction.  */
3279   if (adjustment_def && nested_in_vect_loop
3280       && TREE_CODE (init_val) == SSA_NAME
3281       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3282       && gimple_code (def_stmt) == GIMPLE_PHI
3283       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3284       && vinfo_for_stmt (def_stmt)
3285       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3286           == vect_double_reduction_def)
3287     {
3288       *adjustment_def = NULL;
3289       return vect_create_destination_var (init_val, vectype);
3290     }
3291
3292   if (TREE_CONSTANT (init_val))
3293     {
3294       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3295         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3296       else
3297         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3298     }
3299   else
3300     init_value = init_val;
3301
3302   switch (code)
3303     {
3304       case WIDEN_SUM_EXPR:
3305       case DOT_PROD_EXPR:
3306       case PLUS_EXPR:
3307       case MINUS_EXPR:
3308       case BIT_IOR_EXPR:
3309       case BIT_XOR_EXPR:
3310       case MULT_EXPR:
3311       case BIT_AND_EXPR:
3312         /* ADJUSMENT_DEF is NULL when called from
3313            vect_create_epilog_for_reduction to vectorize double reduction.  */
3314         if (adjustment_def)
3315           {
3316             if (nested_in_vect_loop)
3317               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3318                                                               NULL);
3319             else
3320               *adjustment_def = init_val;
3321           }
3322
3323         if (code == MULT_EXPR)
3324           {
3325             real_init_val = dconst1;
3326             int_init_val = 1;
3327           }
3328
3329         if (code == BIT_AND_EXPR)
3330           int_init_val = -1;
3331
3332         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3333           def_for_init = build_real (scalar_type, real_init_val);
3334         else
3335           def_for_init = build_int_cst (scalar_type, int_init_val);
3336
3337         /* Create a vector of '0' or '1' except the first element.  */
3338         for (i = nunits - 2; i >= 0; --i)
3339           t = tree_cons (NULL_TREE, def_for_init, t);
3340
3341         /* Option1: the first element is '0' or '1' as well.  */
3342         if (adjustment_def)
3343           {
3344             t = tree_cons (NULL_TREE, def_for_init, t);
3345             init_def = build_vector (vectype, t);
3346             break;
3347           }
3348
3349         /* Option2: the first element is INIT_VAL.  */
3350         t = tree_cons (NULL_TREE, init_value, t);
3351         if (TREE_CONSTANT (init_val))
3352           init_def = build_vector (vectype, t);
3353         else
3354           init_def = build_constructor_from_list (vectype, t);
3355
3356         break;
3357
3358       case MIN_EXPR:
3359       case MAX_EXPR:
3360       case COND_EXPR:
3361         if (adjustment_def)
3362           {
3363             *adjustment_def = NULL_TREE;
3364             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3365             break;
3366           }
3367
3368         init_def = build_vector_from_val (vectype, init_value);
3369         break;
3370
3371       default:
3372         gcc_unreachable ();
3373     }
3374
3375   return init_def;
3376 }
3377
3378
3379 /* Function vect_create_epilog_for_reduction
3380
3381    Create code at the loop-epilog to finalize the result of a reduction
3382    computation.
3383
3384    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3385      reduction statements.
3386    STMT is the scalar reduction stmt that is being vectorized.
3387    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3388      number of elements that we can fit in a vectype (nunits).  In this case
3389      we have to generate more than one vector stmt - i.e - we need to "unroll"
3390      the vector stmt by a factor VF/nunits.  For more details see documentation
3391      in vectorizable_operation.
3392    REDUC_CODE is the tree-code for the epilog reduction.
3393    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3394      computation.
3395    REDUC_INDEX is the index of the operand in the right hand side of the
3396      statement that is defined by REDUCTION_PHI.
3397    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3398    SLP_NODE is an SLP node containing a group of reduction statements. The
3399      first one in this group is STMT.
3400
3401    This function:
3402    1. Creates the reduction def-use cycles: sets the arguments for
3403       REDUCTION_PHIS:
3404       The loop-entry argument is the vectorized initial-value of the reduction.
3405       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3406       sums.
3407    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3408       by applying the operation specified by REDUC_CODE if available, or by
3409       other means (whole-vector shifts or a scalar loop).
3410       The function also creates a new phi node at the loop exit to preserve
3411       loop-closed form, as illustrated below.
3412
3413      The flow at the entry to this function:
3414
3415         loop:
3416           vec_def = phi <null, null>            # REDUCTION_PHI
3417           VECT_DEF = vector_stmt                # vectorized form of STMT
3418           s_loop = scalar_stmt                  # (scalar) STMT
3419         loop_exit:
3420           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3421           use <s_out0>
3422           use <s_out0>
3423
3424      The above is transformed by this function into:
3425
3426         loop:
3427           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3428           VECT_DEF = vector_stmt                # vectorized form of STMT
3429           s_loop = scalar_stmt                  # (scalar) STMT
3430         loop_exit:
3431           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3432           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3433           v_out2 = reduce <v_out1>
3434           s_out3 = extract_field <v_out2, 0>
3435           s_out4 = adjust_result <s_out3>
3436           use <s_out4>
3437           use <s_out4>
3438 */
3439
3440 static void
3441 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3442                                   int ncopies, enum tree_code reduc_code,
3443                                   VEC (gimple, heap) *reduction_phis,
3444                                   int reduc_index, bool double_reduc,
3445                                   slp_tree slp_node)
3446 {
3447   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3448   stmt_vec_info prev_phi_info;
3449   tree vectype;
3450   enum machine_mode mode;
3451   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3452   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3453   basic_block exit_bb;
3454   tree scalar_dest;
3455   tree scalar_type;
3456   gimple new_phi = NULL, phi;
3457   gimple_stmt_iterator exit_gsi;
3458   tree vec_dest;
3459   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3460   gimple epilog_stmt = NULL;
3461   enum tree_code code = gimple_assign_rhs_code (stmt);
3462   gimple exit_phi;
3463   tree bitsize, bitpos;
3464   tree adjustment_def = NULL;
3465   tree vec_initial_def = NULL;
3466   tree reduction_op, expr, def;
3467   tree orig_name, scalar_result;
3468   imm_use_iterator imm_iter, phi_imm_iter;
3469   use_operand_p use_p, phi_use_p;
3470   bool extract_scalar_result = false;
3471   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3472   bool nested_in_vect_loop = false;
3473   VEC (gimple, heap) *new_phis = NULL;
3474   enum vect_def_type dt = vect_unknown_def_type;
3475   int j, i;
3476   VEC (tree, heap) *scalar_results = NULL;
3477   unsigned int group_size = 1, k, ratio;
3478   VEC (tree, heap) *vec_initial_defs = NULL;
3479   VEC (gimple, heap) *phis;
3480   bool slp_reduc = false;
3481   tree new_phi_result;
3482
3483   if (slp_node)
3484     group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3485
3486   if (nested_in_vect_loop_p (loop, stmt))
3487     {
3488       outer_loop = loop;
3489       loop = loop->inner;
3490       nested_in_vect_loop = true;
3491       gcc_assert (!slp_node);
3492     }
3493
3494   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3495     {
3496     case GIMPLE_SINGLE_RHS:
3497       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3498                   == ternary_op);
3499       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3500       break;
3501     case GIMPLE_UNARY_RHS:
3502       reduction_op = gimple_assign_rhs1 (stmt);
3503       break;
3504     case GIMPLE_BINARY_RHS:
3505       reduction_op = reduc_index ?
3506                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3507       break;
3508     case GIMPLE_TERNARY_RHS:
3509       reduction_op = gimple_op (stmt, reduc_index + 1);
3510       break;
3511     default:
3512       gcc_unreachable ();
3513     }
3514
3515   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3516   gcc_assert (vectype);
3517   mode = TYPE_MODE (vectype);
3518
3519   /* 1. Create the reduction def-use cycle:
3520      Set the arguments of REDUCTION_PHIS, i.e., transform
3521
3522         loop:
3523           vec_def = phi <null, null>            # REDUCTION_PHI
3524           VECT_DEF = vector_stmt                # vectorized form of STMT
3525           ...
3526
3527      into:
3528
3529         loop:
3530           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3531           VECT_DEF = vector_stmt                # vectorized form of STMT
3532           ...
3533
3534      (in case of SLP, do it for all the phis). */
3535
3536   /* Get the loop-entry arguments.  */
3537   if (slp_node)
3538     vect_get_slp_defs (reduction_op, NULL_TREE, slp_node, &vec_initial_defs,
3539                        NULL, reduc_index);
3540   else
3541     {
3542       vec_initial_defs = VEC_alloc (tree, heap, 1);
3543      /* For the case of reduction, vect_get_vec_def_for_operand returns
3544         the scalar def before the loop, that defines the initial value
3545         of the reduction variable.  */
3546       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3547                                                       &adjustment_def);
3548       VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3549     }
3550
3551   /* Set phi nodes arguments.  */
3552   FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3553     {
3554       tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3555       tree def = VEC_index (tree, vect_defs, i);
3556       for (j = 0; j < ncopies; j++)
3557         {
3558           /* Set the loop-entry arg of the reduction-phi.  */
3559           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3560                        UNKNOWN_LOCATION);
3561
3562           /* Set the loop-latch arg for the reduction-phi.  */
3563           if (j > 0)
3564             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3565
3566           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3567
3568           if (vect_print_dump_info (REPORT_DETAILS))
3569             {
3570               fprintf (vect_dump, "transform reduction: created def-use"
3571                                   " cycle: ");
3572               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3573               fprintf (vect_dump, "\n");
3574               print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3575                                  TDF_SLIM);
3576             }
3577
3578           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3579         }
3580     }
3581
3582   VEC_free (tree, heap, vec_initial_defs);
3583
3584   /* 2. Create epilog code.
3585         The reduction epilog code operates across the elements of the vector
3586         of partial results computed by the vectorized loop.
3587         The reduction epilog code consists of:
3588
3589         step 1: compute the scalar result in a vector (v_out2)
3590         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3591         step 3: adjust the scalar result (s_out3) if needed.
3592
3593         Step 1 can be accomplished using one the following three schemes:
3594           (scheme 1) using reduc_code, if available.
3595           (scheme 2) using whole-vector shifts, if available.
3596           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3597                      combined.
3598
3599           The overall epilog code looks like this:
3600
3601           s_out0 = phi <s_loop>         # original EXIT_PHI
3602           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3603           v_out2 = reduce <v_out1>              # step 1
3604           s_out3 = extract_field <v_out2, 0>    # step 2
3605           s_out4 = adjust_result <s_out3>       # step 3
3606
3607           (step 3 is optional, and steps 1 and 2 may be combined).
3608           Lastly, the uses of s_out0 are replaced by s_out4.  */
3609
3610
3611   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3612          v_out1 = phi <VECT_DEF>
3613          Store them in NEW_PHIS.  */
3614
3615   exit_bb = single_exit (loop)->dest;
3616   prev_phi_info = NULL;
3617   new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3618   FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3619     {
3620       for (j = 0; j < ncopies; j++)
3621         {
3622           phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3623           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3624           if (j == 0)
3625             VEC_quick_push (gimple, new_phis, phi);
3626           else
3627             {
3628               def = vect_get_vec_def_for_stmt_copy (dt, def);
3629               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3630             }
3631
3632           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3633           prev_phi_info = vinfo_for_stmt (phi);
3634         }
3635     }
3636
3637   /* The epilogue is created for the outer-loop, i.e., for the loop being
3638      vectorized.  */
3639   if (double_reduc)
3640     {
3641       loop = outer_loop;
3642       exit_bb = single_exit (loop)->dest;
3643     }
3644
3645   exit_gsi = gsi_after_labels (exit_bb);
3646
3647   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3648          (i.e. when reduc_code is not available) and in the final adjustment
3649          code (if needed).  Also get the original scalar reduction variable as
3650          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3651          represents a reduction pattern), the tree-code and scalar-def are
3652          taken from the original stmt that the pattern-stmt (STMT) replaces.
3653          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3654          are taken from STMT.  */
3655
3656   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3657   if (!orig_stmt)
3658     {
3659       /* Regular reduction  */
3660       orig_stmt = stmt;
3661     }
3662   else
3663     {
3664       /* Reduction pattern  */
3665       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3666       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3667       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3668     }
3669
3670   code = gimple_assign_rhs_code (orig_stmt);
3671   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3672      partial results are added and not subtracted.  */
3673   if (code == MINUS_EXPR)
3674     code = PLUS_EXPR;
3675
3676   scalar_dest = gimple_assign_lhs (orig_stmt);
3677   scalar_type = TREE_TYPE (scalar_dest);
3678   scalar_results = VEC_alloc (tree, heap, group_size);
3679   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3680   bitsize = TYPE_SIZE (scalar_type);
3681
3682   /* In case this is a reduction in an inner-loop while vectorizing an outer
3683      loop - we don't need to extract a single scalar result at the end of the
3684      inner-loop (unless it is double reduction, i.e., the use of reduction is
3685      outside the outer-loop).  The final vector of partial results will be used
3686      in the vectorized outer-loop, or reduced to a scalar result at the end of
3687      the outer-loop.  */
3688   if (nested_in_vect_loop && !double_reduc)
3689     goto vect_finalize_reduction;
3690
3691   /* SLP reduction without reduction chain, e.g.,
3692      # a1 = phi <a2, a0>
3693      # b1 = phi <b2, b0>
3694      a2 = operation (a1)
3695      b2 = operation (b1)  */
3696   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3697
3698   /* In case of reduction chain, e.g.,
3699      # a1 = phi <a3, a0>
3700      a2 = operation (a1)
3701      a3 = operation (a2),
3702
3703      we may end up with more than one vector result.  Here we reduce them to
3704      one vector.  */
3705   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3706     {
3707       tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3708       tree tmp;
3709       gimple new_vec_stmt = NULL;
3710
3711       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3712       for (k = 1; k < VEC_length (gimple, new_phis); k++)
3713         {
3714           gimple next_phi = VEC_index (gimple, new_phis, k);
3715           tree second_vect = PHI_RESULT (next_phi);
3716
3717           tmp = build2 (code, vectype,  first_vect, second_vect);
3718           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3719           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3720           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3721           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3722         }
3723
3724       new_phi_result = first_vect;
3725       if (new_vec_stmt)
3726         {
3727           VEC_truncate (gimple, new_phis, 0);
3728           VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3729         }
3730     }
3731   else
3732     new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3733
3734   /* 2.3 Create the reduction code, using one of the three schemes described
3735          above. In SLP we simply need to extract all the elements from the
3736          vector (without reducing them), so we use scalar shifts.  */
3737   if (reduc_code != ERROR_MARK && !slp_reduc)
3738     {
3739       tree tmp;
3740
3741       /*** Case 1:  Create:
3742            v_out2 = reduc_expr <v_out1>  */
3743
3744       if (vect_print_dump_info (REPORT_DETAILS))
3745         fprintf (vect_dump, "Reduce using direct vector reduction.");
3746
3747       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3748       tmp = build1 (reduc_code, vectype, new_phi_result);
3749       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3750       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3751       gimple_assign_set_lhs (epilog_stmt, new_temp);
3752       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3753
3754       extract_scalar_result = true;
3755     }
3756   else
3757     {
3758       enum tree_code shift_code = ERROR_MARK;
3759       bool have_whole_vector_shift = true;
3760       int bit_offset;
3761       int element_bitsize = tree_low_cst (bitsize, 1);
3762       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3763       tree vec_temp;
3764
3765       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3766         shift_code = VEC_RSHIFT_EXPR;
3767       else
3768         have_whole_vector_shift = false;
3769
3770       /* Regardless of whether we have a whole vector shift, if we're
3771          emulating the operation via tree-vect-generic, we don't want
3772          to use it.  Only the first round of the reduction is likely
3773          to still be profitable via emulation.  */
3774       /* ??? It might be better to emit a reduction tree code here, so that
3775          tree-vect-generic can expand the first round via bit tricks.  */
3776       if (!VECTOR_MODE_P (mode))
3777         have_whole_vector_shift = false;
3778       else
3779         {
3780           optab optab = optab_for_tree_code (code, vectype, optab_default);
3781           if (optab_handler (optab, mode) == CODE_FOR_nothing)
3782             have_whole_vector_shift = false;
3783         }
3784
3785       if (have_whole_vector_shift && !slp_reduc)
3786         {
3787           /*** Case 2: Create:
3788              for (offset = VS/2; offset >= element_size; offset/=2)
3789                 {
3790                   Create:  va' = vec_shift <va, offset>
3791                   Create:  va = vop <va, va'>
3792                 }  */
3793
3794           if (vect_print_dump_info (REPORT_DETAILS))
3795             fprintf (vect_dump, "Reduce using vector shifts");
3796
3797           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3798           new_temp = new_phi_result;
3799           for (bit_offset = vec_size_in_bits/2;
3800                bit_offset >= element_bitsize;
3801                bit_offset /= 2)
3802             {
3803               tree bitpos = size_int (bit_offset);
3804
3805               epilog_stmt = gimple_build_assign_with_ops (shift_code,
3806                                                vec_dest, new_temp, bitpos);
3807               new_name = make_ssa_name (vec_dest, epilog_stmt);
3808               gimple_assign_set_lhs (epilog_stmt, new_name);
3809               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3810
3811               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3812                                                           new_name, new_temp);
3813               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3814               gimple_assign_set_lhs (epilog_stmt, new_temp);
3815               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3816             }
3817
3818           extract_scalar_result = true;
3819         }
3820       else
3821         {
3822           tree rhs;
3823
3824           /*** Case 3: Create:
3825              s = extract_field <v_out2, 0>
3826              for (offset = element_size;
3827                   offset < vector_size;
3828                   offset += element_size;)
3829                {
3830                  Create:  s' = extract_field <v_out2, offset>
3831                  Create:  s = op <s, s'>  // For non SLP cases
3832                }  */
3833
3834           if (vect_print_dump_info (REPORT_DETAILS))
3835             fprintf (vect_dump, "Reduce using scalar code. ");
3836
3837           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3838           FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3839             {
3840               if (gimple_code (new_phi) == GIMPLE_PHI)
3841                 vec_temp = PHI_RESULT (new_phi);
3842               else
3843                 vec_temp = gimple_assign_lhs (new_phi);
3844               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3845                             bitsize_zero_node);
3846               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3847               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3848               gimple_assign_set_lhs (epilog_stmt, new_temp);
3849               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3850
3851               /* In SLP we don't need to apply reduction operation, so we just
3852                  collect s' values in SCALAR_RESULTS.  */
3853               if (slp_reduc)
3854                 VEC_safe_push (tree, heap, scalar_results, new_temp);
3855
3856               for (bit_offset = element_bitsize;
3857                    bit_offset < vec_size_in_bits;
3858                    bit_offset += element_bitsize)
3859                 {
3860                   tree bitpos = bitsize_int (bit_offset);
3861                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3862                                      bitsize, bitpos);
3863
3864                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3865                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3866                   gimple_assign_set_lhs (epilog_stmt, new_name);
3867                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3868
3869                   if (slp_reduc)
3870                     {
3871                       /* In SLP we don't need to apply reduction operation, so
3872                          we just collect s' values in SCALAR_RESULTS.  */
3873                       new_temp = new_name;
3874                       VEC_safe_push (tree, heap, scalar_results, new_name);
3875                     }
3876                   else
3877                     {
3878                       epilog_stmt = gimple_build_assign_with_ops (code,
3879                                           new_scalar_dest, new_name, new_temp);
3880                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3881                       gimple_assign_set_lhs (epilog_stmt, new_temp);
3882                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3883                     }
3884                 }
3885             }
3886
3887           /* The only case where we need to reduce scalar results in SLP, is
3888              unrolling.  If the size of SCALAR_RESULTS is greater than
3889              GROUP_SIZE, we reduce them combining elements modulo
3890              GROUP_SIZE.  */
3891           if (slp_reduc)
3892             {
3893               tree res, first_res, new_res;
3894               gimple new_stmt;
3895
3896               /* Reduce multiple scalar results in case of SLP unrolling.  */
3897               for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3898                    j++)
3899                 {
3900                   first_res = VEC_index (tree, scalar_results, j % group_size);
3901                   new_stmt = gimple_build_assign_with_ops (code,
3902                                               new_scalar_dest, first_res, res);
3903                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
3904                   gimple_assign_set_lhs (new_stmt, new_res);
3905                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3906                   VEC_replace (tree, scalar_results, j % group_size, new_res);
3907                 }
3908             }
3909           else
3910             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
3911             VEC_safe_push (tree, heap, scalar_results, new_temp);
3912
3913           extract_scalar_result = false;
3914         }
3915     }
3916
3917   /* 2.4  Extract the final scalar result.  Create:
3918           s_out3 = extract_field <v_out2, bitpos>  */
3919
3920   if (extract_scalar_result)
3921     {
3922       tree rhs;
3923
3924       if (vect_print_dump_info (REPORT_DETAILS))
3925         fprintf (vect_dump, "extract scalar result");
3926
3927       if (BYTES_BIG_ENDIAN)
3928         bitpos = size_binop (MULT_EXPR,
3929                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3930                              TYPE_SIZE (scalar_type));
3931       else
3932         bitpos = bitsize_zero_node;
3933
3934       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3935       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3936       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3937       gimple_assign_set_lhs (epilog_stmt, new_temp);
3938       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3939       VEC_safe_push (tree, heap, scalar_results, new_temp);
3940     }
3941
3942 vect_finalize_reduction:
3943
3944   if (double_reduc)
3945     loop = loop->inner;
3946
3947   /* 2.5 Adjust the final result by the initial value of the reduction
3948          variable. (When such adjustment is not needed, then
3949          'adjustment_def' is zero).  For example, if code is PLUS we create:
3950          new_temp = loop_exit_def + adjustment_def  */
3951
3952   if (adjustment_def)
3953     {
3954       gcc_assert (!slp_reduc);
3955       if (nested_in_vect_loop)
3956         {
3957           new_phi = VEC_index (gimple, new_phis, 0);
3958           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3959           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3960           new_dest = vect_create_destination_var (scalar_dest, vectype);
3961         }
3962       else
3963         {
3964           new_temp = VEC_index (tree, scalar_results, 0);
3965           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3966           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3967           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3968         }
3969
3970       epilog_stmt = gimple_build_assign (new_dest, expr);
3971       new_temp = make_ssa_name (new_dest, epilog_stmt);
3972       gimple_assign_set_lhs (epilog_stmt, new_temp);
3973       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3974       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3975       if (nested_in_vect_loop)
3976         {
3977           set_vinfo_for_stmt (epilog_stmt,
3978                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3979                                                  NULL));
3980           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3981                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3982
3983           if (!double_reduc)
3984             VEC_quick_push (tree, scalar_results, new_temp);
3985           else
3986             VEC_replace (tree, scalar_results, 0, new_temp);
3987         }
3988       else
3989         VEC_replace (tree, scalar_results, 0, new_temp);
3990
3991       VEC_replace (gimple, new_phis, 0, epilog_stmt);
3992     }
3993
3994   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
3995           phis with new adjusted scalar results, i.e., replace use <s_out0>
3996           with use <s_out4>.
3997
3998      Transform:
3999         loop_exit:
4000           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4001           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4002           v_out2 = reduce <v_out1>
4003           s_out3 = extract_field <v_out2, 0>
4004           s_out4 = adjust_result <s_out3>
4005           use <s_out0>
4006           use <s_out0>
4007
4008      into:
4009
4010         loop_exit:
4011           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4012           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4013           v_out2 = reduce <v_out1>
4014           s_out3 = extract_field <v_out2, 0>
4015           s_out4 = adjust_result <s_out3>
4016           use <s_out4>
4017           use <s_out4> */
4018
4019
4020   /* In SLP reduction chain we reduce vector results into one vector if
4021      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4022      the last stmt in the reduction chain, since we are looking for the loop
4023      exit phi node.  */
4024   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4025     {
4026       scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4027                                        SLP_TREE_SCALAR_STMTS (slp_node),
4028                                        group_size - 1));
4029       group_size = 1;
4030     }
4031
4032   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4033      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4034      need to match SCALAR_RESULTS with corresponding statements.  The first
4035      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4036      the first vector stmt, etc.
4037      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4038   if (group_size > VEC_length (gimple, new_phis))
4039     {
4040       ratio = group_size / VEC_length (gimple, new_phis);
4041       gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4042     }
4043   else
4044     ratio = 1;
4045
4046   for (k = 0; k < group_size; k++)
4047     {
4048       if (k % ratio == 0)
4049         {
4050           epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4051           reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4052         }
4053
4054       if (slp_reduc)
4055         {
4056           gimple current_stmt = VEC_index (gimple,
4057                                        SLP_TREE_SCALAR_STMTS (slp_node), k);
4058
4059           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4060           /* SLP statements can't participate in patterns.  */
4061           gcc_assert (!orig_stmt);
4062           scalar_dest = gimple_assign_lhs (current_stmt);
4063         }
4064
4065       phis = VEC_alloc (gimple, heap, 3);
4066       /* Find the loop-closed-use at the loop exit of the original scalar
4067          result.  (The reduction result is expected to have two immediate uses -
4068          one at the latch block, and one at the loop exit).  */
4069       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4070         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4071           VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4072
4073       /* We expect to have found an exit_phi because of loop-closed-ssa
4074          form.  */
4075       gcc_assert (!VEC_empty (gimple, phis));
4076
4077       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4078         {
4079           if (outer_loop)
4080             {
4081               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4082               gimple vect_phi;
4083
4084               /* FORNOW. Currently not supporting the case that an inner-loop
4085                  reduction is not used in the outer-loop (but only outside the
4086                  outer-loop), unless it is double reduction.  */
4087               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4088                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4089                           || double_reduc);
4090
4091               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4092               if (!double_reduc
4093                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4094                       != vect_double_reduction_def)
4095                 continue;
4096
4097               /* Handle double reduction:
4098
4099                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4100                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4101                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4102                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4103
4104                  At that point the regular reduction (stmt2 and stmt3) is
4105                  already vectorized, as well as the exit phi node, stmt4.
4106                  Here we vectorize the phi node of double reduction, stmt1, and
4107                  update all relevant statements.  */
4108
4109               /* Go through all the uses of s2 to find double reduction phi
4110                  node, i.e., stmt1 above.  */
4111               orig_name = PHI_RESULT (exit_phi);
4112               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4113                 {
4114                   stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4115                   stmt_vec_info new_phi_vinfo;
4116                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4117                   basic_block bb = gimple_bb (use_stmt);
4118                   gimple use;
4119
4120                   /* Check that USE_STMT is really double reduction phi
4121                      node.  */
4122                   if (gimple_code (use_stmt) != GIMPLE_PHI
4123                       || gimple_phi_num_args (use_stmt) != 2
4124                       || !use_stmt_vinfo
4125                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4126                           != vect_double_reduction_def
4127                       || bb->loop_father != outer_loop)
4128                     continue;
4129
4130                   /* Create vector phi node for double reduction:
4131                      vs1 = phi <vs0, vs2>
4132                      vs1 was created previously in this function by a call to
4133                        vect_get_vec_def_for_operand and is stored in
4134                        vec_initial_def;
4135                      vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
4136                      vs0 is created here.  */
4137
4138                   /* Create vector phi node.  */
4139                   vect_phi = create_phi_node (vec_initial_def, bb);
4140                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4141                                     loop_vec_info_for_loop (outer_loop), NULL);
4142                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4143
4144                   /* Create vs0 - initial def of the double reduction phi.  */
4145                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4146                                              loop_preheader_edge (outer_loop));
4147                   init_def = get_initial_def_for_reduction (stmt,
4148                                                           preheader_arg, NULL);
4149                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4150                                                     vectype, NULL);
4151
4152                   /* Update phi node arguments with vs0 and vs2.  */
4153                   add_phi_arg (vect_phi, vect_phi_init,
4154                                loop_preheader_edge (outer_loop),
4155                                UNKNOWN_LOCATION);
4156                   add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
4157                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4158                   if (vect_print_dump_info (REPORT_DETAILS))
4159                     {
4160                       fprintf (vect_dump, "created double reduction phi "
4161                                           "node: ");
4162                       print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4163                     }
4164
4165                   vect_phi_res = PHI_RESULT (vect_phi);
4166
4167                   /* Replace the use, i.e., set the correct vs1 in the regular
4168                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4169                      loop is redundant.  */
4170                   use = reduction_phi;
4171                   for (j = 0; j < ncopies; j++)
4172                     {
4173                       edge pr_edge = loop_preheader_edge (loop);
4174                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4175                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4176                     }
4177                 }
4178             }
4179         }
4180
4181       VEC_free (gimple, heap, phis);
4182       if (nested_in_vect_loop)
4183         {
4184           if (double_reduc)
4185             loop = outer_loop;
4186           else
4187             continue;
4188         }
4189
4190       phis = VEC_alloc (gimple, heap, 3);
4191       /* Find the loop-closed-use at the loop exit of the original scalar
4192          result.  (The reduction result is expected to have two immediate uses,
4193          one at the latch block, and one at the loop exit).  For double
4194          reductions we are looking for exit phis of the outer loop.  */
4195       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4196         {
4197           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4198             VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4199           else
4200             {
4201               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4202                 {
4203                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4204
4205                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4206                     {
4207                       if (!flow_bb_inside_loop_p (loop,
4208                                              gimple_bb (USE_STMT (phi_use_p))))
4209                         VEC_safe_push (gimple, heap, phis,
4210                                        USE_STMT (phi_use_p));
4211                     }
4212                 }
4213             }
4214         }
4215
4216       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4217         {
4218           /* Replace the uses:  */
4219           orig_name = PHI_RESULT (exit_phi);
4220           scalar_result = VEC_index (tree, scalar_results, k);
4221           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4222             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4223               SET_USE (use_p, scalar_result);
4224         }
4225
4226       VEC_free (gimple, heap, phis);
4227     }
4228
4229   VEC_free (tree, heap, scalar_results);
4230   VEC_free (gimple, heap, new_phis);
4231 }
4232
4233
4234 /* Function vectorizable_reduction.
4235
4236    Check if STMT performs a reduction operation that can be vectorized.
4237    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4238    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4239    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4240
4241    This function also handles reduction idioms (patterns) that have been
4242    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4243    of this form:
4244      X = pattern_expr (arg0, arg1, ..., X)
4245    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4246    sequence that had been detected and replaced by the pattern-stmt (STMT).
4247
4248    In some cases of reduction patterns, the type of the reduction variable X is
4249    different than the type of the other arguments of STMT.
4250    In such cases, the vectype that is used when transforming STMT into a vector
4251    stmt is different than the vectype that is used to determine the
4252    vectorization factor, because it consists of a different number of elements
4253    than the actual number of elements that are being operated upon in parallel.
4254
4255    For example, consider an accumulation of shorts into an int accumulator.
4256    On some targets it's possible to vectorize this pattern operating on 8
4257    shorts at a time (hence, the vectype for purposes of determining the
4258    vectorization factor should be V8HI); on the other hand, the vectype that
4259    is used to create the vector form is actually V4SI (the type of the result).
4260
4261    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4262    indicates what is the actual level of parallelism (V8HI in the example), so
4263    that the right vectorization factor would be derived.  This vectype
4264    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4265    be used to create the vectorized stmt.  The right vectype for the vectorized
4266    stmt is obtained from the type of the result X:
4267         get_vectype_for_scalar_type (TREE_TYPE (X))
4268
4269    This means that, contrary to "regular" reductions (or "regular" stmts in
4270    general), the following equation:
4271       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4272    does *NOT* necessarily hold for reduction patterns.  */
4273
4274 bool
4275 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4276                         gimple *vec_stmt, slp_tree slp_node)
4277 {
4278   tree vec_dest;
4279   tree scalar_dest;
4280   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4281   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4282   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4283   tree vectype_in = NULL_TREE;
4284   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4285   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4286   enum tree_code code, orig_code, epilog_reduc_code;
4287   enum machine_mode vec_mode;
4288   int op_type;
4289   optab optab, reduc_optab;
4290   tree new_temp = NULL_TREE;
4291   tree def;
4292   gimple def_stmt;
4293   enum vect_def_type dt;
4294   gimple new_phi = NULL;
4295   tree scalar_type;
4296   bool is_simple_use;
4297   gimple orig_stmt;
4298   stmt_vec_info orig_stmt_info;
4299   tree expr = NULL_TREE;
4300   int i;
4301   int ncopies;
4302   int epilog_copies;
4303   stmt_vec_info prev_stmt_info, prev_phi_info;
4304   bool single_defuse_cycle = false;
4305   tree reduc_def = NULL_TREE;
4306   gimple new_stmt = NULL;
4307   int j;
4308   tree ops[3];
4309   bool nested_cycle = false, found_nested_cycle_def = false;
4310   gimple reduc_def_stmt = NULL;
4311   /* The default is that the reduction variable is the last in statement.  */
4312   int reduc_index = 2;
4313   bool double_reduc = false, dummy;
4314   basic_block def_bb;
4315   struct loop * def_stmt_loop, *outer_loop = NULL;
4316   tree def_arg;
4317   gimple def_arg_stmt;
4318   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4319   VEC (gimple, heap) *phis = NULL;
4320   int vec_num;
4321   tree def0, def1, tem, op0, op1 = NULL_TREE;
4322
4323   /* In case of reduction chain we switch to the first stmt in the chain, but
4324      we don't update STMT_INFO, since only the last stmt is marked as reduction
4325      and has reduction properties.  */
4326   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4327     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4328
4329   if (nested_in_vect_loop_p (loop, stmt))
4330     {
4331       outer_loop = loop;
4332       loop = loop->inner;
4333       nested_cycle = true;
4334     }
4335
4336   /* 1. Is vectorizable reduction?  */
4337   /* Not supportable if the reduction variable is used in the loop, unless
4338      it's a reduction chain.  */
4339   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4340       && !GROUP_FIRST_ELEMENT (stmt_info))
4341     return false;
4342
4343   /* Reductions that are not used even in an enclosing outer-loop,
4344      are expected to be "live" (used out of the loop).  */
4345   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4346       && !STMT_VINFO_LIVE_P (stmt_info))
4347     return false;
4348
4349   /* Make sure it was already recognized as a reduction computation.  */
4350   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4351       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4352     return false;
4353
4354   /* 2. Has this been recognized as a reduction pattern?
4355
4356      Check if STMT represents a pattern that has been recognized
4357      in earlier analysis stages.  For stmts that represent a pattern,
4358      the STMT_VINFO_RELATED_STMT field records the last stmt in
4359      the original sequence that constitutes the pattern.  */
4360
4361   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4362   if (orig_stmt)
4363     {
4364       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4365       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4366       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4367       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4368     }
4369
4370   /* 3. Check the operands of the operation.  The first operands are defined
4371         inside the loop body. The last operand is the reduction variable,
4372         which is defined by the loop-header-phi.  */
4373
4374   gcc_assert (is_gimple_assign (stmt));
4375
4376   /* Flatten RHS.  */
4377   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4378     {
4379     case GIMPLE_SINGLE_RHS:
4380       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4381       if (op_type == ternary_op)
4382         {
4383           tree rhs = gimple_assign_rhs1 (stmt);
4384           ops[0] = TREE_OPERAND (rhs, 0);
4385           ops[1] = TREE_OPERAND (rhs, 1);
4386           ops[2] = TREE_OPERAND (rhs, 2);
4387           code = TREE_CODE (rhs);
4388         }
4389       else
4390         return false;
4391       break;
4392
4393     case GIMPLE_BINARY_RHS:
4394       code = gimple_assign_rhs_code (stmt);
4395       op_type = TREE_CODE_LENGTH (code);
4396       gcc_assert (op_type == binary_op);
4397       ops[0] = gimple_assign_rhs1 (stmt);
4398       ops[1] = gimple_assign_rhs2 (stmt);
4399       break;
4400
4401     case GIMPLE_TERNARY_RHS:
4402       code = gimple_assign_rhs_code (stmt);
4403       op_type = TREE_CODE_LENGTH (code);
4404       gcc_assert (op_type == ternary_op);
4405       ops[0] = gimple_assign_rhs1 (stmt);
4406       ops[1] = gimple_assign_rhs2 (stmt);
4407       ops[2] = gimple_assign_rhs3 (stmt);
4408       break;
4409
4410     case GIMPLE_UNARY_RHS:
4411       return false;
4412
4413     default:
4414       gcc_unreachable ();
4415     }
4416
4417   scalar_dest = gimple_assign_lhs (stmt);
4418   scalar_type = TREE_TYPE (scalar_dest);
4419   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4420       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4421     return false;
4422
4423   /* All uses but the last are expected to be defined in the loop.
4424      The last use is the reduction variable.  In case of nested cycle this
4425      assumption is not true: we use reduc_index to record the index of the
4426      reduction variable.  */
4427   for (i = 0; i < op_type-1; i++)
4428     {
4429       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4430       if (i == 0 && code == COND_EXPR)
4431         continue;
4432
4433       is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4434                                             &def_stmt, &def, &dt, &tem);
4435       if (!vectype_in)
4436         vectype_in = tem;
4437       gcc_assert (is_simple_use);
4438
4439       if (dt != vect_internal_def
4440           && dt != vect_external_def
4441           && dt != vect_constant_def
4442           && dt != vect_induction_def
4443           && !(dt == vect_nested_cycle && nested_cycle))
4444         return false;
4445
4446       if (dt == vect_nested_cycle)
4447         {
4448           found_nested_cycle_def = true;
4449           reduc_def_stmt = def_stmt;
4450           reduc_index = i;
4451         }
4452     }
4453
4454   is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4455                                         &def, &dt, &tem);
4456   if (!vectype_in)
4457     vectype_in = tem;
4458   gcc_assert (is_simple_use);
4459   gcc_assert (dt == vect_reduction_def
4460               || dt == vect_nested_cycle
4461               || ((dt == vect_internal_def || dt == vect_external_def
4462                    || dt == vect_constant_def || dt == vect_induction_def)
4463                    && nested_cycle && found_nested_cycle_def));
4464   if (!found_nested_cycle_def)
4465     reduc_def_stmt = def_stmt;
4466
4467   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4468   if (orig_stmt)
4469     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4470                                                        reduc_def_stmt,
4471                                                        !nested_cycle,
4472                                                        &dummy));
4473   else
4474     {
4475       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4476                                              !nested_cycle, &dummy);
4477       /* We changed STMT to be the first stmt in reduction chain, hence we
4478          check that in this case the first element in the chain is STMT.  */
4479       gcc_assert (stmt == tmp
4480                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4481     }
4482
4483   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4484     return false;
4485
4486   if (slp_node || PURE_SLP_STMT (stmt_info))
4487     ncopies = 1;
4488   else
4489     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4490                / TYPE_VECTOR_SUBPARTS (vectype_in));
4491
4492   gcc_assert (ncopies >= 1);
4493
4494   vec_mode = TYPE_MODE (vectype_in);
4495
4496   if (code == COND_EXPR)
4497     {
4498       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
4499         {
4500           if (vect_print_dump_info (REPORT_DETAILS))
4501             fprintf (vect_dump, "unsupported condition in reduction");
4502
4503             return false;
4504         }
4505     }
4506   else
4507     {
4508       /* 4. Supportable by target?  */
4509
4510       /* 4.1. check support for the operation in the loop  */
4511       optab = optab_for_tree_code (code, vectype_in, optab_default);
4512       if (!optab)
4513         {
4514           if (vect_print_dump_info (REPORT_DETAILS))
4515             fprintf (vect_dump, "no optab.");
4516
4517           return false;
4518         }
4519
4520       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4521         {
4522           if (vect_print_dump_info (REPORT_DETAILS))
4523             fprintf (vect_dump, "op not supported by target.");
4524
4525           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4526               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4527                   < vect_min_worthwhile_factor (code))
4528             return false;
4529
4530           if (vect_print_dump_info (REPORT_DETAILS))
4531             fprintf (vect_dump, "proceeding using word mode.");
4532         }
4533
4534       /* Worthwhile without SIMD support?  */
4535       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4536           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4537              < vect_min_worthwhile_factor (code))
4538         {
4539           if (vect_print_dump_info (REPORT_DETAILS))
4540             fprintf (vect_dump, "not worthwhile without SIMD support.");
4541
4542           return false;
4543         }
4544     }
4545
4546   /* 4.2. Check support for the epilog operation.
4547
4548           If STMT represents a reduction pattern, then the type of the
4549           reduction variable may be different than the type of the rest
4550           of the arguments.  For example, consider the case of accumulation
4551           of shorts into an int accumulator; The original code:
4552                         S1: int_a = (int) short_a;
4553           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4554
4555           was replaced with:
4556                         STMT: int_acc = widen_sum <short_a, int_acc>
4557
4558           This means that:
4559           1. The tree-code that is used to create the vector operation in the
4560              epilog code (that reduces the partial results) is not the
4561              tree-code of STMT, but is rather the tree-code of the original
4562              stmt from the pattern that STMT is replacing.  I.e, in the example
4563              above we want to use 'widen_sum' in the loop, but 'plus' in the
4564              epilog.
4565           2. The type (mode) we use to check available target support
4566              for the vector operation to be created in the *epilog*, is
4567              determined by the type of the reduction variable (in the example
4568              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4569              However the type (mode) we use to check available target support
4570              for the vector operation to be created *inside the loop*, is
4571              determined by the type of the other arguments to STMT (in the
4572              example we'd check this: optab_handler (widen_sum_optab,
4573              vect_short_mode)).
4574
4575           This is contrary to "regular" reductions, in which the types of all
4576           the arguments are the same as the type of the reduction variable.
4577           For "regular" reductions we can therefore use the same vector type
4578           (and also the same tree-code) when generating the epilog code and
4579           when generating the code inside the loop.  */
4580
4581   if (orig_stmt)
4582     {
4583       /* This is a reduction pattern: get the vectype from the type of the
4584          reduction variable, and get the tree-code from orig_stmt.  */
4585       orig_code = gimple_assign_rhs_code (orig_stmt);
4586       gcc_assert (vectype_out);
4587       vec_mode = TYPE_MODE (vectype_out);
4588     }
4589   else
4590     {
4591       /* Regular reduction: use the same vectype and tree-code as used for
4592          the vector code inside the loop can be used for the epilog code. */
4593       orig_code = code;
4594     }
4595
4596   if (nested_cycle)
4597     {
4598       def_bb = gimple_bb (reduc_def_stmt);
4599       def_stmt_loop = def_bb->loop_father;
4600       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4601                                        loop_preheader_edge (def_stmt_loop));
4602       if (TREE_CODE (def_arg) == SSA_NAME
4603           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4604           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4605           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4606           && vinfo_for_stmt (def_arg_stmt)
4607           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4608               == vect_double_reduction_def)
4609         double_reduc = true;
4610     }
4611
4612   epilog_reduc_code = ERROR_MARK;
4613   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4614     {
4615       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4616                                          optab_default);
4617       if (!reduc_optab)
4618         {
4619           if (vect_print_dump_info (REPORT_DETAILS))
4620             fprintf (vect_dump, "no optab for reduction.");
4621
4622           epilog_reduc_code = ERROR_MARK;
4623         }
4624
4625       if (reduc_optab
4626           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4627         {
4628           if (vect_print_dump_info (REPORT_DETAILS))
4629             fprintf (vect_dump, "reduc op not supported by target.");
4630
4631           epilog_reduc_code = ERROR_MARK;
4632         }
4633     }
4634   else
4635     {
4636       if (!nested_cycle || double_reduc)
4637         {
4638           if (vect_print_dump_info (REPORT_DETAILS))
4639             fprintf (vect_dump, "no reduc code for scalar code.");
4640
4641           return false;
4642         }
4643     }
4644
4645   if (double_reduc && ncopies > 1)
4646     {
4647       if (vect_print_dump_info (REPORT_DETAILS))
4648         fprintf (vect_dump, "multiple types in double reduction");
4649
4650       return false;
4651     }
4652
4653   /* In case of widenning multiplication by a constant, we update the type
4654      of the constant to be the type of the other operand.  We check that the
4655      constant fits the type in the pattern recognition pass.  */
4656   if (code == DOT_PROD_EXPR
4657       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4658     {
4659       if (TREE_CODE (ops[0]) == INTEGER_CST)
4660         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4661       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4662         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4663       else
4664         {
4665           if (vect_print_dump_info (REPORT_DETAILS))
4666             fprintf (vect_dump, "invalid types in dot-prod");
4667
4668           return false;
4669         }
4670     }
4671
4672   if (!vec_stmt) /* transformation not required.  */
4673     {
4674       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4675         return false;
4676       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4677       return true;
4678     }
4679
4680   /** Transform.  **/
4681
4682   if (vect_print_dump_info (REPORT_DETAILS))
4683     fprintf (vect_dump, "transform reduction.");
4684
4685   /* FORNOW: Multiple types are not supported for condition.  */
4686   if (code == COND_EXPR)
4687     gcc_assert (ncopies == 1);
4688
4689   /* Create the destination vector  */
4690   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4691
4692   /* In case the vectorization factor (VF) is bigger than the number
4693      of elements that we can fit in a vectype (nunits), we have to generate
4694      more than one vector stmt - i.e - we need to "unroll" the
4695      vector stmt by a factor VF/nunits.  For more details see documentation
4696      in vectorizable_operation.  */
4697
4698   /* If the reduction is used in an outer loop we need to generate
4699      VF intermediate results, like so (e.g. for ncopies=2):
4700         r0 = phi (init, r0)
4701         r1 = phi (init, r1)
4702         r0 = x0 + r0;
4703         r1 = x1 + r1;
4704     (i.e. we generate VF results in 2 registers).
4705     In this case we have a separate def-use cycle for each copy, and therefore
4706     for each copy we get the vector def for the reduction variable from the
4707     respective phi node created for this copy.
4708
4709     Otherwise (the reduction is unused in the loop nest), we can combine
4710     together intermediate results, like so (e.g. for ncopies=2):
4711         r = phi (init, r)
4712         r = x0 + r;
4713         r = x1 + r;
4714    (i.e. we generate VF/2 results in a single register).
4715    In this case for each copy we get the vector def for the reduction variable
4716    from the vectorized reduction operation generated in the previous iteration.
4717   */
4718
4719   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4720     {
4721       single_defuse_cycle = true;
4722       epilog_copies = 1;
4723     }
4724   else
4725     epilog_copies = ncopies;
4726
4727   prev_stmt_info = NULL;
4728   prev_phi_info = NULL;
4729   if (slp_node)
4730     {
4731       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4732       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4733                   == TYPE_VECTOR_SUBPARTS (vectype_in));
4734     }
4735   else
4736     {
4737       vec_num = 1;
4738       vec_oprnds0 = VEC_alloc (tree, heap, 1);
4739       if (op_type == ternary_op)
4740         vec_oprnds1 = VEC_alloc (tree, heap, 1);
4741     }
4742
4743   phis = VEC_alloc (gimple, heap, vec_num);
4744   vect_defs = VEC_alloc (tree, heap, vec_num);
4745   if (!slp_node)
4746     VEC_quick_push (tree, vect_defs, NULL_TREE);
4747
4748   for (j = 0; j < ncopies; j++)
4749     {
4750       if (j == 0 || !single_defuse_cycle)
4751         {
4752           for (i = 0; i < vec_num; i++)
4753             {
4754               /* Create the reduction-phi that defines the reduction
4755                  operand.  */
4756               new_phi = create_phi_node (vec_dest, loop->header);
4757               set_vinfo_for_stmt (new_phi,
4758                                   new_stmt_vec_info (new_phi, loop_vinfo,
4759                                                      NULL));
4760                if (j == 0 || slp_node)
4761                  VEC_quick_push (gimple, phis, new_phi);
4762             }
4763         }
4764
4765       if (code == COND_EXPR)
4766         {
4767           gcc_assert (!slp_node);
4768           vectorizable_condition (stmt, gsi, vec_stmt,
4769                                   PHI_RESULT (VEC_index (gimple, phis, 0)),
4770                                   reduc_index);
4771           /* Multiple types are not supported for condition.  */
4772           break;
4773         }
4774
4775       /* Handle uses.  */
4776       if (j == 0)
4777         {
4778           op0 = ops[!reduc_index];
4779           if (op_type == ternary_op)
4780             {
4781               if (reduc_index == 0)
4782                 op1 = ops[2];
4783               else
4784                 op1 = ops[1];
4785             }
4786
4787           if (slp_node)
4788             vect_get_slp_defs (op0, op1, slp_node, &vec_oprnds0, &vec_oprnds1,
4789                                -1);
4790           else
4791             {
4792               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4793                                                             stmt, NULL);
4794               VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4795               if (op_type == ternary_op)
4796                {
4797                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4798                                                                NULL);
4799                  VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4800                }
4801             }
4802         }
4803       else
4804         {
4805           if (!slp_node)
4806             {
4807               enum vect_def_type dt;
4808               gimple dummy_stmt;
4809               tree dummy;
4810
4811               vect_is_simple_use (ops[!reduc_index], loop_vinfo, NULL,
4812                                   &dummy_stmt, &dummy, &dt);
4813               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
4814                                                               loop_vec_def0);
4815               VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4816               if (op_type == ternary_op)
4817                 {
4818                   vect_is_simple_use (op1, loop_vinfo, NULL, &dummy_stmt,
4819                                       &dummy, &dt);
4820                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4821                                                                 loop_vec_def1);
4822                   VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4823                 }
4824             }
4825
4826           if (single_defuse_cycle)
4827             reduc_def = gimple_assign_lhs (new_stmt);
4828
4829           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4830         }
4831
4832       FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4833         {
4834           if (slp_node)
4835             reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4836           else
4837             {
4838               if (!single_defuse_cycle || j == 0)
4839                 reduc_def = PHI_RESULT (new_phi);
4840             }
4841
4842           def1 = ((op_type == ternary_op)
4843                   ? VEC_index (tree, vec_oprnds1, i) : NULL);
4844           if (op_type == binary_op)
4845             {
4846               if (reduc_index == 0)
4847                 expr = build2 (code, vectype_out, reduc_def, def0);
4848               else
4849                 expr = build2 (code, vectype_out, def0, reduc_def);
4850             }
4851           else
4852             {
4853               if (reduc_index == 0)
4854                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4855               else
4856                 {
4857                   if (reduc_index == 1)
4858                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
4859                   else
4860                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
4861                 }
4862             }
4863
4864           new_stmt = gimple_build_assign (vec_dest, expr);
4865           new_temp = make_ssa_name (vec_dest, new_stmt);
4866           gimple_assign_set_lhs (new_stmt, new_temp);
4867           vect_finish_stmt_generation (stmt, new_stmt, gsi);
4868
4869           if (slp_node)
4870             {
4871               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4872               VEC_quick_push (tree, vect_defs, new_temp);
4873             }
4874           else
4875             VEC_replace (tree, vect_defs, 0, new_temp);
4876         }
4877
4878       if (slp_node)
4879         continue;
4880
4881       if (j == 0)
4882         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4883       else
4884         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4885
4886       prev_stmt_info = vinfo_for_stmt (new_stmt);
4887       prev_phi_info = vinfo_for_stmt (new_phi);
4888     }
4889
4890   /* Finalize the reduction-phi (set its arguments) and create the
4891      epilog reduction code.  */
4892   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4893     {
4894       new_temp = gimple_assign_lhs (*vec_stmt);
4895       VEC_replace (tree, vect_defs, 0, new_temp);
4896     }
4897
4898   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4899                                     epilog_reduc_code, phis, reduc_index,
4900                                     double_reduc, slp_node);
4901
4902   VEC_free (gimple, heap, phis);
4903   VEC_free (tree, heap, vec_oprnds0);
4904   if (vec_oprnds1)
4905     VEC_free (tree, heap, vec_oprnds1);
4906
4907   return true;
4908 }
4909
4910 /* Function vect_min_worthwhile_factor.
4911
4912    For a loop where we could vectorize the operation indicated by CODE,
4913    return the minimum vectorization factor that makes it worthwhile
4914    to use generic vectors.  */
4915 int
4916 vect_min_worthwhile_factor (enum tree_code code)
4917 {
4918   switch (code)
4919     {
4920     case PLUS_EXPR:
4921     case MINUS_EXPR:
4922     case NEGATE_EXPR:
4923       return 4;
4924
4925     case BIT_AND_EXPR:
4926     case BIT_IOR_EXPR:
4927     case BIT_XOR_EXPR:
4928     case BIT_NOT_EXPR:
4929       return 2;
4930
4931     default:
4932       return INT_MAX;
4933     }
4934 }
4935
4936
4937 /* Function vectorizable_induction
4938
4939    Check if PHI performs an induction computation that can be vectorized.
4940    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4941    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4942    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4943
4944 bool
4945 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4946                         gimple *vec_stmt)
4947 {
4948   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4949   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4950   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4951   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4952   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4953   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4954   tree vec_def;
4955
4956   gcc_assert (ncopies >= 1);
4957   /* FORNOW. This restriction should be relaxed.  */
4958   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4959     {
4960       if (vect_print_dump_info (REPORT_DETAILS))
4961         fprintf (vect_dump, "multiple types in nested loop.");
4962       return false;
4963     }
4964
4965   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4966     return false;
4967
4968   /* FORNOW: SLP not supported.  */
4969   if (STMT_SLP_TYPE (stmt_info))
4970     return false;
4971
4972   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4973
4974   if (gimple_code (phi) != GIMPLE_PHI)
4975     return false;
4976
4977   if (!vec_stmt) /* transformation not required.  */
4978     {
4979       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4980       if (vect_print_dump_info (REPORT_DETAILS))
4981         fprintf (vect_dump, "=== vectorizable_induction ===");
4982       vect_model_induction_cost (stmt_info, ncopies);
4983       return true;
4984     }
4985
4986   /** Transform.  **/
4987
4988   if (vect_print_dump_info (REPORT_DETAILS))
4989     fprintf (vect_dump, "transform induction phi.");
4990
4991   vec_def = get_initial_def_for_induction (phi);
4992   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4993   return true;
4994 }
4995
4996 /* Function vectorizable_live_operation.
4997
4998    STMT computes a value that is used outside the loop.  Check if
4999    it can be supported.  */
5000
5001 bool
5002 vectorizable_live_operation (gimple stmt,
5003                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5004                              gimple *vec_stmt ATTRIBUTE_UNUSED)
5005 {
5006   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5007   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5008   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5009   int i;
5010   int op_type;
5011   tree op;
5012   tree def;
5013   gimple def_stmt;
5014   enum vect_def_type dt;
5015   enum tree_code code;
5016   enum gimple_rhs_class rhs_class;
5017
5018   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5019
5020   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5021     return false;
5022
5023   if (!is_gimple_assign (stmt))
5024     return false;
5025
5026   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5027     return false;
5028
5029   /* FORNOW. CHECKME. */
5030   if (nested_in_vect_loop_p (loop, stmt))
5031     return false;
5032
5033   code = gimple_assign_rhs_code (stmt);
5034   op_type = TREE_CODE_LENGTH (code);
5035   rhs_class = get_gimple_rhs_class (code);
5036   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5037   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5038
5039   /* FORNOW: support only if all uses are invariant.  This means
5040      that the scalar operations can remain in place, unvectorized.
5041      The original last scalar value that they compute will be used.  */
5042
5043   for (i = 0; i < op_type; i++)
5044     {
5045       if (rhs_class == GIMPLE_SINGLE_RHS)
5046         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5047       else
5048         op = gimple_op (stmt, i + 1);
5049       if (op
5050           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
5051         {
5052           if (vect_print_dump_info (REPORT_DETAILS))
5053             fprintf (vect_dump, "use not simple.");
5054           return false;
5055         }
5056
5057       if (dt != vect_external_def && dt != vect_constant_def)
5058         return false;
5059     }
5060
5061   /* No transformation is required for the cases we currently support.  */
5062   return true;
5063 }
5064
5065 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5066
5067 static void
5068 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5069 {
5070   ssa_op_iter op_iter;
5071   imm_use_iterator imm_iter;
5072   def_operand_p def_p;
5073   gimple ustmt;
5074
5075   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5076     {
5077       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5078         {
5079           basic_block bb;
5080
5081           if (!is_gimple_debug (ustmt))
5082             continue;
5083
5084           bb = gimple_bb (ustmt);
5085
5086           if (!flow_bb_inside_loop_p (loop, bb))
5087             {
5088               if (gimple_debug_bind_p (ustmt))
5089                 {
5090                   if (vect_print_dump_info (REPORT_DETAILS))
5091                     fprintf (vect_dump, "killing debug use");
5092
5093                   gimple_debug_bind_reset_value (ustmt);
5094                   update_stmt (ustmt);
5095                 }
5096               else
5097                 gcc_unreachable ();
5098             }
5099         }
5100     }
5101 }
5102
5103 /* Function vect_transform_loop.
5104
5105    The analysis phase has determined that the loop is vectorizable.
5106    Vectorize the loop - created vectorized stmts to replace the scalar
5107    stmts in the loop, and update the loop exit condition.  */
5108
5109 void
5110 vect_transform_loop (loop_vec_info loop_vinfo)
5111 {
5112   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5113   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5114   int nbbs = loop->num_nodes;
5115   gimple_stmt_iterator si;
5116   int i;
5117   tree ratio = NULL;
5118   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5119   bool strided_store;
5120   bool slp_scheduled = false;
5121   unsigned int nunits;
5122   tree cond_expr = NULL_TREE;
5123   gimple_seq cond_expr_stmt_list = NULL;
5124   bool do_peeling_for_loop_bound;
5125   gimple stmt, pattern_stmt, pattern_def_stmt;
5126   bool transform_pattern_stmt = false, pattern_def = false;
5127
5128   if (vect_print_dump_info (REPORT_DETAILS))
5129     fprintf (vect_dump, "=== vec_transform_loop ===");
5130
5131   /* Peel the loop if there are data refs with unknown alignment.
5132      Only one data ref with unknown store is allowed.  */
5133
5134   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5135     vect_do_peeling_for_alignment (loop_vinfo);
5136
5137   do_peeling_for_loop_bound
5138     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5139        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5140            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5141        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
5142
5143   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5144       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5145     vect_loop_versioning (loop_vinfo,
5146                           !do_peeling_for_loop_bound,
5147                           &cond_expr, &cond_expr_stmt_list);
5148
5149   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5150      compile time constant), or it is a constant that doesn't divide by the
5151      vectorization factor, then an epilog loop needs to be created.
5152      We therefore duplicate the loop: the original loop will be vectorized,
5153      and will compute the first (n/VF) iterations.  The second copy of the loop
5154      will remain scalar and will compute the remaining (n%VF) iterations.
5155      (VF is the vectorization factor).  */
5156
5157   if (do_peeling_for_loop_bound)
5158     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5159                                     cond_expr, cond_expr_stmt_list);
5160   else
5161     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5162                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5163
5164   /* 1) Make sure the loop header has exactly two entries
5165      2) Make sure we have a preheader basic block.  */
5166
5167   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5168
5169   split_edge (loop_preheader_edge (loop));
5170
5171   /* FORNOW: the vectorizer supports only loops which body consist
5172      of one basic block (header + empty latch). When the vectorizer will
5173      support more involved loop forms, the order by which the BBs are
5174      traversed need to be reconsidered.  */
5175
5176   for (i = 0; i < nbbs; i++)
5177     {
5178       basic_block bb = bbs[i];
5179       stmt_vec_info stmt_info;
5180       gimple phi;
5181
5182       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5183         {
5184           phi = gsi_stmt (si);
5185           if (vect_print_dump_info (REPORT_DETAILS))
5186             {
5187               fprintf (vect_dump, "------>vectorizing phi: ");
5188               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5189             }
5190           stmt_info = vinfo_for_stmt (phi);
5191           if (!stmt_info)
5192             continue;
5193
5194           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5195             vect_loop_kill_debug_uses (loop, phi);
5196
5197           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5198               && !STMT_VINFO_LIVE_P (stmt_info))
5199             continue;
5200
5201           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5202                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5203               && vect_print_dump_info (REPORT_DETAILS))
5204             fprintf (vect_dump, "multiple-types.");
5205
5206           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5207             {
5208               if (vect_print_dump_info (REPORT_DETAILS))
5209                 fprintf (vect_dump, "transform phi.");
5210               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5211             }
5212         }
5213
5214       pattern_stmt = NULL;
5215       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5216         {
5217           bool is_store;
5218
5219           if (transform_pattern_stmt)
5220             {
5221               stmt = pattern_stmt;
5222               transform_pattern_stmt = false;
5223             }
5224           else
5225             stmt = gsi_stmt (si);
5226
5227           if (vect_print_dump_info (REPORT_DETAILS))
5228             {
5229               fprintf (vect_dump, "------>vectorizing statement: ");
5230               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5231             }
5232
5233           stmt_info = vinfo_for_stmt (stmt);
5234
5235           /* vector stmts created in the outer-loop during vectorization of
5236              stmts in an inner-loop may not have a stmt_info, and do not
5237              need to be vectorized.  */
5238           if (!stmt_info)
5239             {
5240               gsi_next (&si);
5241               continue;
5242             }
5243
5244           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5245             vect_loop_kill_debug_uses (loop, stmt);
5246
5247           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5248               && !STMT_VINFO_LIVE_P (stmt_info))
5249             {
5250               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5251                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5252                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5253                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5254                 {
5255                   stmt = pattern_stmt;
5256                   stmt_info = vinfo_for_stmt (stmt);
5257                 }
5258               else
5259                 {
5260                   gsi_next (&si);
5261                   continue;
5262                 }
5263             }
5264           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5265                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5266                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5267                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5268             transform_pattern_stmt = true;
5269
5270           /* If pattern statement has a def stmt, vectorize it too.  */
5271           if (is_pattern_stmt_p (stmt_info)
5272               && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
5273               && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
5274                   || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
5275             {
5276               if (pattern_def)
5277                 pattern_def = false;
5278               else
5279                 {
5280                   if (vect_print_dump_info (REPORT_DETAILS))
5281                     {
5282                       fprintf (vect_dump, "==> vectorizing pattern def"
5283                                           " stmt: ");
5284                       print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
5285                                          TDF_SLIM);
5286                     }
5287
5288                   pattern_def = true;
5289                   stmt = pattern_def_stmt;
5290                   stmt_info = vinfo_for_stmt (stmt);
5291                 }
5292             }
5293
5294           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5295           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5296                                                STMT_VINFO_VECTYPE (stmt_info));
5297           if (!STMT_SLP_TYPE (stmt_info)
5298               && nunits != (unsigned int) vectorization_factor
5299               && vect_print_dump_info (REPORT_DETAILS))
5300             /* For SLP VF is set according to unrolling factor, and not to
5301                vector size, hence for SLP this print is not valid.  */
5302             fprintf (vect_dump, "multiple-types.");
5303
5304           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5305              reached.  */
5306           if (STMT_SLP_TYPE (stmt_info))
5307             {
5308               if (!slp_scheduled)
5309                 {
5310                   slp_scheduled = true;
5311
5312                   if (vect_print_dump_info (REPORT_DETAILS))
5313                     fprintf (vect_dump, "=== scheduling SLP instances ===");
5314
5315                   vect_schedule_slp (loop_vinfo, NULL);
5316                 }
5317
5318               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5319               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5320                 {
5321                   if (!transform_pattern_stmt && !pattern_def)
5322                     gsi_next (&si);
5323                   continue;
5324                 }
5325             }
5326
5327           /* -------- vectorize statement ------------ */
5328           if (vect_print_dump_info (REPORT_DETAILS))
5329             fprintf (vect_dump, "transform statement.");
5330
5331           strided_store = false;
5332           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5333           if (is_store)
5334             {
5335               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5336                 {
5337                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5338                      interleaving chain was completed - free all the stores in
5339                      the chain.  */
5340                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5341                   gsi_remove (&si, true);
5342                   continue;
5343                 }
5344               else
5345                 {
5346                   /* Free the attached stmt_vec_info and remove the stmt.  */
5347                   free_stmt_vec_info (stmt);
5348                   gsi_remove (&si, true);
5349                   continue;
5350                 }
5351             }
5352
5353           if (!transform_pattern_stmt && !pattern_def)
5354             gsi_next (&si);
5355         }                       /* stmts in BB */
5356     }                           /* BBs in loop */
5357
5358   slpeel_make_loop_iterate_ntimes (loop, ratio);
5359
5360   /* The memory tags and pointers in vectorized statements need to
5361      have their SSA forms updated.  FIXME, why can't this be delayed
5362      until all the loops have been transformed?  */
5363   update_ssa (TODO_update_ssa);
5364
5365   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5366     fprintf (vect_dump, "LOOP VECTORIZED.");
5367   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5368     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
5369 }