gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "tree-pretty-print.h"
  31 #include "gimple-pretty-print.h"
  32 #include "tree-flow.h"
  33 #include "tree-dump.h"
  34 #include "cfgloop.h"
  35 #include "cfglayout.h"
  36 #include "expr.h"
  37 #include "recog.h"
  38 #include "optabs.h"
  39 #include "params.h"
  40 #include "diagnostic-core.h"
  41 #include "tree-chrec.h"
  42 #include "tree-scalar-evolution.h"
  43 #include "tree-vectorizer.h"
  44 #include "target.h"
  45
  46 /* Loop Vectorization Pass.
  47
  48    This pass tries to vectorize loops.
  49
  50    For example, the vectorizer transforms the following simple loop:
  51
  52         short a[N]; short b[N]; short c[N]; int i;
  53
  54         for (i=0; i<N; i++){
  55           a[i] = b[i] + c[i];
  56         }
  57
  58    as if it was manually vectorized by rewriting the source code into:
  59
  60         typedef int __attribute__((mode(V8HI))) v8hi;
  61         short a[N];  short b[N]; short c[N];   int i;
  62         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  63         v8hi va, vb, vc;
  64
  65         for (i=0; i<N/8; i++){
  66           vb = pb[i];
  67           vc = pc[i];
  68           va = vb + vc;
  69           pa[i] = va;
  70         }
  71
  72         The main entry to this pass is vectorize_loops(), in which
  73    the vectorizer applies a set of analyses on a given set of loops,
  74    followed by the actual vectorization transformation for the loops that
  75    had successfully passed the analysis phase.
  76         Throughout this pass we make a distinction between two types of
  77    data: scalars (which are represented by SSA_NAMES), and memory references
  78    ("data-refs").  These two types of data require different handling both
  79    during analysis and transformation. The types of data-refs that the
  80    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  81    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  82    accesses are required to have a simple (consecutive) access pattern.
  83
  84    Analysis phase:
  85    ===============
  86         The driver for the analysis phase is vect_analyze_loop().
  87    It applies a set of analyses, some of which rely on the scalar evolution
  88    analyzer (scev) developed by Sebastian Pop.
  89
  90         During the analysis phase the vectorizer records some information
  91    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  92    loop, as well as general information about the loop as a whole, which is
  93    recorded in a "loop_vec_info" struct attached to each loop.
  94
  95    Transformation phase:
  96    =====================
  97         The loop transformation phase scans all the stmts in the loop, and
  98    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  99    the loop that needs to be vectorized.  It inserts the vector code sequence
 100    just before the scalar stmt S, and records a pointer to the vector code
 101    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 102    attached to S).  This pointer will be used for the vectorization of following
 103    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 104    otherwise, we rely on dead code elimination for removing it.
 105
 106         For example, say stmt S1 was vectorized into stmt VS1:
 107
 108    VS1: vb = px[i];
 109    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 110    S2:  a = b;
 111
 112    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 113    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 114    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 115    resulting sequence would be:
 116
 117    VS1: vb = px[i];
 118    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 119    VS2: va = vb;
 120    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 121
 122         Operands that are not SSA_NAMEs, are data-refs that appear in
 123    load/store operations (like 'x[i]' in S1), and are handled differently.
 124
 125    Target modeling:
 126    =================
 127         Currently the only target specific information that is used is the
 128    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 129    Targets that can support different sizes of vectors, for now will need
 130    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 131    flexibility will be added in the future.
 132
 133         Since we only vectorize operations which vector form can be
 134    expressed using existing tree codes, to verify that an operation is
 135    supported, the vectorizer checks the relevant optab at the relevant
 136    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 137    the value found is CODE_FOR_nothing, then there's no target support, and
 138    we can't vectorize the stmt.
 139
 140    For additional information on this project see:
 141    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 142 */
 143
 144 /* Function vect_determine_vectorization_factor
 145
 146    Determine the vectorization factor (VF).  VF is the number of data elements
 147    that are operated upon in parallel in a single iteration of the vectorized
 148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 150    elements can fit in a single vector register.
 151
 152    We currently support vectorization of loops in which all types operated upon
 153    are of the same size.  Therefore this function currently sets VF according to
 154    the size of the types operated upon, and fails if there are multiple sizes
 155    in the loop.
 156
 157    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 158    original loop:
 159         for (i=0; i<N; i++){
 160           a[i] = b[i] + c[i];
 161         }
 162
 163    vectorized loop:
 164         for (i=0; i<N; i+=VF){
 165           a[i:VF] = b[i:VF] + c[i:VF];
 166         }
 167 */
 168
 169 static bool
 170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 171 {
 172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 174   int nbbs = loop->num_nodes;
 175   gimple_stmt_iterator si;
 176   unsigned int vectorization_factor = 0;
 177   tree scalar_type;
 178   gimple phi;
 179   tree vectype;
 180   unsigned int nunits;
 181   stmt_vec_info stmt_info;
 182   int i;
 183   HOST_WIDE_INT dummy;
 184   gimple stmt, pattern_stmt = NULL, pattern_def_stmt = NULL;
 185   bool analyze_pattern_stmt = false, pattern_def = false;
 186
 187   if (vect_print_dump_info (REPORT_DETAILS))
 188     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 189
 190   for (i = 0; i < nbbs; i++)
 191     {
 192       basic_block bb = bbs[i];
 193
 194       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 195         {
 196           phi = gsi_stmt (si);
 197           stmt_info = vinfo_for_stmt (phi);
 198           if (vect_print_dump_info (REPORT_DETAILS))
 199             {
 200               fprintf (vect_dump, "==> examining phi: ");
 201               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 202             }
 203
 204           gcc_assert (stmt_info);
 205
 206           if (STMT_VINFO_RELEVANT_P (stmt_info))
 207             {
 208               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 209               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 210
 211               if (vect_print_dump_info (REPORT_DETAILS))
 212                 {
 213                   fprintf (vect_dump, "get vectype for scalar type:  ");
 214                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 215                 }
 216
 217               vectype = get_vectype_for_scalar_type (scalar_type);
 218               if (!vectype)
 219                 {
 220                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 221                     {
 222                       fprintf (vect_dump,
 223                                "not vectorized: unsupported data-type ");
 224                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 225                     }
 226                   return false;
 227                 }
 228               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 229
 230               if (vect_print_dump_info (REPORT_DETAILS))
 231                 {
 232                   fprintf (vect_dump, "vectype: ");
 233                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 234                 }
 235
 236               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 237               if (vect_print_dump_info (REPORT_DETAILS))
 238                 fprintf (vect_dump, "nunits = %d", nunits);
 239
 240               if (!vectorization_factor
 241                   || (nunits > vectorization_factor))
 242                 vectorization_factor = nunits;
 243             }
 244         }
 245
 246       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 247         {
 248           tree vf_vectype;
 249
 250           if (analyze_pattern_stmt)
 251             {
 252               stmt = pattern_stmt;
 253               analyze_pattern_stmt = false;
 254             }
 255           else
 256             stmt = gsi_stmt (si);
 257
 258           stmt_info = vinfo_for_stmt (stmt);
 259
 260           if (vect_print_dump_info (REPORT_DETAILS))
 261             {
 262               fprintf (vect_dump, "==> examining statement: ");
 263               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 264             }
 265
 266           gcc_assert (stmt_info);
 267
 268           /* Skip stmts which do not need to be vectorized.  */
 269           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 270               && !STMT_VINFO_LIVE_P (stmt_info))
 271             {
 272               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 273                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 274                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 275                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 276                 {
 277                   stmt = pattern_stmt;
 278                   stmt_info = vinfo_for_stmt (pattern_stmt);
 279                   if (vect_print_dump_info (REPORT_DETAILS))
 280                     {
 281                       fprintf (vect_dump, "==> examining pattern statement: ");
 282                       print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 283                     }
 284                 }
 285               else
 286                 {
 287                   if (vect_print_dump_info (REPORT_DETAILS))
 288                     fprintf (vect_dump, "skip.");
 289                   gsi_next (&si);
 290                   continue;
 291                 }
 292             }
 293           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 294                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 295                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 296                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 297             analyze_pattern_stmt = true;
 298
 299           /* If a pattern statement has a def stmt, analyze it too.  */
 300           if (is_pattern_stmt_p (stmt_info)
 301               && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
 302               && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
 303                   || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
 304             {
 305               if (pattern_def)
 306                 pattern_def = false;
 307               else
 308                 {
 309                   if (vect_print_dump_info (REPORT_DETAILS))
 310                     {
 311                       fprintf (vect_dump, "==> examining pattern def stmt: ");
 312                       print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
 313                                          TDF_SLIM);
 314                     }
 315
 316                   pattern_def = true;
 317                   stmt = pattern_def_stmt;
 318                   stmt_info = vinfo_for_stmt (stmt);
 319                 }
 320             }
 321
 322           if (gimple_get_lhs (stmt) == NULL_TREE)
 323             {
 324               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 325                 {
 326                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 327                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 328                 }
 329               return false;
 330             }
 331
 332           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 333             {
 334               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 335                 {
 336                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 337                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 338                 }
 339               return false;
 340             }
 341
 342           if (STMT_VINFO_VECTYPE (stmt_info))
 343             {
 344               /* The only case when a vectype had been already set is for stmts
 345                  that contain a dataref, or for "pattern-stmts" (stmts
 346                  generated by the vectorizer to represent/replace a certain
 347                  idiom).  */
 348               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 349                           || is_pattern_stmt_p (stmt_info)
 350                           || pattern_def);
 351               vectype = STMT_VINFO_VECTYPE (stmt_info);
 352             }
 353           else
 354             {
 355               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 356               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 357               if (vect_print_dump_info (REPORT_DETAILS))
 358                 {
 359                   fprintf (vect_dump, "get vectype for scalar type:  ");
 360                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 361                 }
 362               vectype = get_vectype_for_scalar_type (scalar_type);
 363               if (!vectype)
 364                 {
 365                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 366                     {
 367                       fprintf (vect_dump,
 368                                "not vectorized: unsupported data-type ");
 369                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 370                     }
 371                   return false;
 372                 }
 373
 374               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 375             }
 376
 377           /* The vectorization factor is according to the smallest
 378              scalar type (or the largest vector size, but we only
 379              support one vector size per loop).  */
 380           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 381                                                        &dummy);
 382           if (vect_print_dump_info (REPORT_DETAILS))
 383             {
 384               fprintf (vect_dump, "get vectype for scalar type:  ");
 385               print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 386             }
 387           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 388           if (!vf_vectype)
 389             {
 390               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 391                 {
 392                   fprintf (vect_dump,
 393                            "not vectorized: unsupported data-type ");
 394                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 395                 }
 396               return false;
 397             }
 398
 399           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 400                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 401             {
 402               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 403                 {
 404                   fprintf (vect_dump,
 405                            "not vectorized: different sized vector "
 406                            "types in statement, ");
 407                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 408                   fprintf (vect_dump, " and ");
 409                   print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 410                 }
 411               return false;
 412             }
 413
 414           if (vect_print_dump_info (REPORT_DETAILS))
 415             {
 416               fprintf (vect_dump, "vectype: ");
 417               print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 418             }
 419
 420           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 421           if (vect_print_dump_info (REPORT_DETAILS))
 422             fprintf (vect_dump, "nunits = %d", nunits);
 423
 424           if (!vectorization_factor
 425               || (nunits > vectorization_factor))
 426             vectorization_factor = nunits;
 427
 428           if (!analyze_pattern_stmt && !pattern_def)
 429             gsi_next (&si);
 430         }
 431     }
 432
 433   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 434   if (vect_print_dump_info (REPORT_DETAILS))
 435     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 436   if (vectorization_factor <= 1)
 437     {
 438       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 439         fprintf (vect_dump, "not vectorized: unsupported data-type");
 440       return false;
 441     }
 442   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 443
 444   return true;
 445 }
 446
 447
 448 /* Function vect_is_simple_iv_evolution.
 449
 450    FORNOW: A simple evolution of an induction variables in the loop is
 451    considered a polynomial evolution with constant step.  */
 452
 453 static bool
 454 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 455                              tree * step)
 456 {
 457   tree init_expr;
 458   tree step_expr;
 459   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 460
 461   /* When there is no evolution in this loop, the evolution function
 462      is not "simple".  */
 463   if (evolution_part == NULL_TREE)
 464     return false;
 465
 466   /* When the evolution is a polynomial of degree >= 2
 467      the evolution function is not "simple".  */
 468   if (tree_is_chrec (evolution_part))
 469     return false;
 470
 471   step_expr = evolution_part;
 472   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 473
 474   if (vect_print_dump_info (REPORT_DETAILS))
 475     {
 476       fprintf (vect_dump, "step: ");
 477       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 478       fprintf (vect_dump, ",  init: ");
 479       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 480     }
 481
 482   *init = init_expr;
 483   *step = step_expr;
 484
 485   if (TREE_CODE (step_expr) != INTEGER_CST)
 486     {
 487       if (vect_print_dump_info (REPORT_DETAILS))
 488         fprintf (vect_dump, "step unknown.");
 489       return false;
 490     }
 491
 492   return true;
 493 }
 494
 495 /* Function vect_analyze_scalar_cycles_1.
 496
 497    Examine the cross iteration def-use cycles of scalar variables
 498    in LOOP.  LOOP_VINFO represents the loop that is now being
 499    considered for vectorization (can be LOOP, or an outer-loop
 500    enclosing LOOP).  */
 501
 502 static void
 503 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 504 {
 505   basic_block bb = loop->header;
 506   tree dumy;
 507   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 508   gimple_stmt_iterator gsi;
 509   bool double_reduc;
 510
 511   if (vect_print_dump_info (REPORT_DETAILS))
 512     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 513
 514   /* First - identify all inductions.  Reduction detection assumes that all the
 515      inductions have been identified, therefore, this order must not be
 516      changed.  */
 517   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 518     {
 519       gimple phi = gsi_stmt (gsi);
 520       tree access_fn = NULL;
 521       tree def = PHI_RESULT (phi);
 522       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 523
 524       if (vect_print_dump_info (REPORT_DETAILS))
 525         {
 526           fprintf (vect_dump, "Analyze phi: ");
 527           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 528         }
 529
 530       /* Skip virtual phi's.  The data dependences that are associated with
 531          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 532       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 533         continue;
 534
 535       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 536
 537       /* Analyze the evolution function.  */
 538       access_fn = analyze_scalar_evolution (loop, def);
 539       if (access_fn)
 540         STRIP_NOPS (access_fn);
 541       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 542         {
 543           fprintf (vect_dump, "Access function of PHI: ");
 544           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 545         }
 546
 547       if (!access_fn
 548           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 549         {
 550           VEC_safe_push (gimple, heap, worklist, phi);
 551           continue;
 552         }
 553
 554       if (vect_print_dump_info (REPORT_DETAILS))
 555         fprintf (vect_dump, "Detected induction.");
 556       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 557     }
 558
 559
 560   /* Second - identify all reductions and nested cycles.  */
 561   while (VEC_length (gimple, worklist) > 0)
 562     {
 563       gimple phi = VEC_pop (gimple, worklist);
 564       tree def = PHI_RESULT (phi);
 565       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 566       gimple reduc_stmt;
 567       bool nested_cycle;
 568
 569       if (vect_print_dump_info (REPORT_DETAILS))
 570         {
 571           fprintf (vect_dump, "Analyze phi: ");
 572           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 573         }
 574
 575       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 576       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 577
 578       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 579       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 580                                                 &double_reduc);
 581       if (reduc_stmt)
 582         {
 583           if (double_reduc)
 584             {
 585               if (vect_print_dump_info (REPORT_DETAILS))
 586                 fprintf (vect_dump, "Detected double reduction.");
 587
 588               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 589               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 590                                                     vect_double_reduction_def;
 591             }
 592           else
 593             {
 594               if (nested_cycle)
 595                 {
 596                   if (vect_print_dump_info (REPORT_DETAILS))
 597                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 598
 599                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 600                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 601                                                              vect_nested_cycle;
 602                 }
 603               else
 604                 {
 605                   if (vect_print_dump_info (REPORT_DETAILS))
 606                     fprintf (vect_dump, "Detected reduction.");
 607
 608                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 609                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 610                                                            vect_reduction_def;
 611                   /* Store the reduction cycles for possible vectorization in
 612                      loop-aware SLP.  */
 613                   VEC_safe_push (gimple, heap,
 614                                  LOOP_VINFO_REDUCTIONS (loop_vinfo),
 615                                  reduc_stmt);
 616                 }
 617             }
 618         }
 619       else
 620         if (vect_print_dump_info (REPORT_DETAILS))
 621           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 622     }
 623
 624   VEC_free (gimple, heap, worklist);
 625 }
 626
 627
 628 /* Function vect_analyze_scalar_cycles.
 629
 630    Examine the cross iteration def-use cycles of scalar variables, by
 631    analyzing the loop-header PHIs of scalar variables.  Classify each
 632    cycle as one of the following: invariant, induction, reduction, unknown.
 633    We do that for the loop represented by LOOP_VINFO, and also to its
 634    inner-loop, if exists.
 635    Examples for scalar cycles:
 636
 637    Example1: reduction:
 638
 639               loop1:
 640               for (i=0; i<N; i++)
 641                  sum += a[i];
 642
 643    Example2: induction:
 644
 645               loop2:
 646               for (i=0; i<N; i++)
 647                  a[i] = i;  */
 648
 649 static void
 650 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 651 {
 652   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 653
 654   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 655
 656   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 657      Reductions in such inner-loop therefore have different properties than
 658      the reductions in the nest that gets vectorized:
 659      1. When vectorized, they are executed in the same order as in the original
 660         scalar loop, so we can't change the order of computation when
 661         vectorizing them.
 662      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 663         current checks are too strict.  */
 664
 665   if (loop->inner)
 666     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 667 }
 668
 669 /* Function vect_get_loop_niters.
 670
 671    Determine how many iterations the loop is executed.
 672    If an expression that represents the number of iterations
 673    can be constructed, place it in NUMBER_OF_ITERATIONS.
 674    Return the loop exit condition.  */
 675
 676 static gimple
 677 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 678 {
 679   tree niters;
 680
 681   if (vect_print_dump_info (REPORT_DETAILS))
 682     fprintf (vect_dump, "=== get_loop_niters ===");
 683
 684   niters = number_of_exit_cond_executions (loop);
 685
 686   if (niters != NULL_TREE
 687       && niters != chrec_dont_know)
 688     {
 689       *number_of_iterations = niters;
 690
 691       if (vect_print_dump_info (REPORT_DETAILS))
 692         {
 693           fprintf (vect_dump, "==> get_loop_niters:" );
 694           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 695         }
 696     }
 697
 698   return get_loop_exit_condition (loop);
 699 }
 700
 701
 702 /* Function bb_in_loop_p
 703
 704    Used as predicate for dfs order traversal of the loop bbs.  */
 705
 706 static bool
 707 bb_in_loop_p (const_basic_block bb, const void *data)
 708 {
 709   const struct loop *const loop = (const struct loop *)data;
 710   if (flow_bb_inside_loop_p (loop, bb))
 711     return true;
 712   return false;
 713 }
 714
 715
 716 /* Function new_loop_vec_info.
 717
 718    Create and initialize a new loop_vec_info struct for LOOP, as well as
 719    stmt_vec_info structs for all the stmts in LOOP.  */
 720
 721 static loop_vec_info
 722 new_loop_vec_info (struct loop *loop)
 723 {
 724   loop_vec_info res;
 725   basic_block *bbs;
 726   gimple_stmt_iterator si;
 727   unsigned int i, nbbs;
 728
 729   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 730   LOOP_VINFO_LOOP (res) = loop;
 731
 732   bbs = get_loop_body (loop);
 733
 734   /* Create/Update stmt_info for all stmts in the loop.  */
 735   for (i = 0; i < loop->num_nodes; i++)
 736     {
 737       basic_block bb = bbs[i];
 738
 739       /* BBs in a nested inner-loop will have been already processed (because
 740          we will have called vect_analyze_loop_form for any nested inner-loop).
 741          Therefore, for stmts in an inner-loop we just want to update the
 742          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 743          loop_info of the outer-loop we are currently considering to vectorize
 744          (instead of the loop_info of the inner-loop).
 745          For stmts in other BBs we need to create a stmt_info from scratch.  */
 746       if (bb->loop_father != loop)
 747         {
 748           /* Inner-loop bb.  */
 749           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 750           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 751             {
 752               gimple phi = gsi_stmt (si);
 753               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 754               loop_vec_info inner_loop_vinfo =
 755                 STMT_VINFO_LOOP_VINFO (stmt_info);
 756               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 757               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 758             }
 759           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 760            {
 761               gimple stmt = gsi_stmt (si);
 762               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 763               loop_vec_info inner_loop_vinfo =
 764                  STMT_VINFO_LOOP_VINFO (stmt_info);
 765               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 766               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 767            }
 768         }
 769       else
 770         {
 771           /* bb in current nest.  */
 772           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 773             {
 774               gimple phi = gsi_stmt (si);
 775               gimple_set_uid (phi, 0);
 776               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 777             }
 778
 779           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 780             {
 781               gimple stmt = gsi_stmt (si);
 782               gimple_set_uid (stmt, 0);
 783               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 784             }
 785         }
 786     }
 787
 788   /* CHECKME: We want to visit all BBs before their successors (except for
 789      latch blocks, for which this assertion wouldn't hold).  In the simple
 790      case of the loop forms we allow, a dfs order of the BBs would the same
 791      as reversed postorder traversal, so we are safe.  */
 792
 793    free (bbs);
 794    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 795    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 796                               bbs, loop->num_nodes, loop);
 797    gcc_assert (nbbs == loop->num_nodes);
 798
 799   LOOP_VINFO_BBS (res) = bbs;
 800   LOOP_VINFO_NITERS (res) = NULL;
 801   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 802   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 803   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 804   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 805   LOOP_VINFO_VECT_FACTOR (res) = 0;
 806   LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
 807   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 808   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 809   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 810   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 811     VEC_alloc (gimple, heap,
 812                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 813   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 814     VEC_alloc (ddr_p, heap,
 815                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 816   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 817   LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
 818   LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
 819   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 820   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 821   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 822   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 823
 824   return res;
 825 }
 826
 827
 828 /* Function destroy_loop_vec_info.
 829
 830    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 831    stmts in the loop.  */
 832
 833 void
 834 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 835 {
 836   struct loop *loop;
 837   basic_block *bbs;
 838   int nbbs;
 839   gimple_stmt_iterator si;
 840   int j;
 841   VEC (slp_instance, heap) *slp_instances;
 842   slp_instance instance;
 843
 844   if (!loop_vinfo)
 845     return;
 846
 847   loop = LOOP_VINFO_LOOP (loop_vinfo);
 848
 849   bbs = LOOP_VINFO_BBS (loop_vinfo);
 850   nbbs = loop->num_nodes;
 851
 852   if (!clean_stmts)
 853     {
 854       free (LOOP_VINFO_BBS (loop_vinfo));
 855       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 856       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 857       VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
 858       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 859       VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 860
 861       free (loop_vinfo);
 862       loop->aux = NULL;
 863       return;
 864     }
 865
 866   for (j = 0; j < nbbs; j++)
 867     {
 868       basic_block bb = bbs[j];
 869       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 870         free_stmt_vec_info (gsi_stmt (si));
 871
 872       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 873         {
 874           gimple stmt = gsi_stmt (si);
 875           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 876
 877           if (stmt_info)
 878             {
 879               /* Check if this statement has a related "pattern stmt"
 880                  (introduced by the vectorizer during the pattern recognition
 881                  pass).  Free pattern's stmt_vec_info.  */
 882               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 883                   && vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)))
 884                 free_stmt_vec_info (STMT_VINFO_RELATED_STMT (stmt_info));
 885
 886               /* Free stmt_vec_info.  */
 887               free_stmt_vec_info (stmt);
 888             }
 889
 890           gsi_next (&si);
 891         }
 892     }
 893
 894   free (LOOP_VINFO_BBS (loop_vinfo));
 895   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 896   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 897   VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
 898   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 899   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 900   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 901   FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
 902     vect_free_slp_instance (instance);
 903
 904   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 905   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 906   VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
 907   VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
 908
 909   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 910     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 911
 912   free (loop_vinfo);
 913   loop->aux = NULL;
 914 }
 915
 916
 917 /* Function vect_analyze_loop_1.
 918
 919    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 920    for it. The different analyses will record information in the
 921    loop_vec_info struct.  This is a subset of the analyses applied in
 922    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 923    that is now considered for (outer-loop) vectorization.  */
 924
 925 static loop_vec_info
 926 vect_analyze_loop_1 (struct loop *loop)
 927 {
 928   loop_vec_info loop_vinfo;
 929
 930   if (vect_print_dump_info (REPORT_DETAILS))
 931     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 932
 933   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 934
 935   loop_vinfo = vect_analyze_loop_form (loop);
 936   if (!loop_vinfo)
 937     {
 938       if (vect_print_dump_info (REPORT_DETAILS))
 939         fprintf (vect_dump, "bad inner-loop form.");
 940       return NULL;
 941     }
 942
 943   return loop_vinfo;
 944 }
 945
 946
 947 /* Function vect_analyze_loop_form.
 948
 949    Verify that certain CFG restrictions hold, including:
 950    - the loop has a pre-header
 951    - the loop has a single entry and exit
 952    - the loop exit condition is simple enough, and the number of iterations
 953      can be analyzed (a countable loop).  */
 954
 955 loop_vec_info
 956 vect_analyze_loop_form (struct loop *loop)
 957 {
 958   loop_vec_info loop_vinfo;
 959   gimple loop_cond;
 960   tree number_of_iterations = NULL;
 961   loop_vec_info inner_loop_vinfo = NULL;
 962
 963   if (vect_print_dump_info (REPORT_DETAILS))
 964     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 965
 966   /* Different restrictions apply when we are considering an inner-most loop,
 967      vs. an outer (nested) loop.
 968      (FORNOW. May want to relax some of these restrictions in the future).  */
 969
 970   if (!loop->inner)
 971     {
 972       /* Inner-most loop.  We currently require that the number of BBs is
 973          exactly 2 (the header and latch).  Vectorizable inner-most loops
 974          look like this:
 975
 976                         (pre-header)
 977                            |
 978                           header <--------+
 979                            | |            |
 980                            | +--> latch --+
 981                            |
 982                         (exit-bb)  */
 983
 984       if (loop->num_nodes != 2)
 985         {
 986           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 987             fprintf (vect_dump, "not vectorized: control flow in loop.");
 988           return NULL;
 989         }
 990
 991       if (empty_block_p (loop->header))
 992     {
 993           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 994             fprintf (vect_dump, "not vectorized: empty loop.");
 995       return NULL;
 996     }
 997     }
 998   else
 999     {
1000       struct loop *innerloop = loop->inner;
1001       edge entryedge;
1002
1003       /* Nested loop. We currently require that the loop is doubly-nested,
1004          contains a single inner loop, and the number of BBs is exactly 5.
1005          Vectorizable outer-loops look like this:
1006
1007                         (pre-header)
1008                            |
1009                           header <---+
1010                            |         |
1011                           inner-loop |
1012                            |         |
1013                           tail ------+
1014                            |
1015                         (exit-bb)
1016
1017          The inner-loop has the properties expected of inner-most loops
1018          as described above.  */
1019
1020       if ((loop->inner)->inner || (loop->inner)->next)
1021         {
1022           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1023             fprintf (vect_dump, "not vectorized: multiple nested loops.");
1024           return NULL;
1025         }
1026
1027       /* Analyze the inner-loop.  */
1028       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1029       if (!inner_loop_vinfo)
1030         {
1031           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1032             fprintf (vect_dump, "not vectorized: Bad inner loop.");
1033           return NULL;
1034         }
1035
1036       if (!expr_invariant_in_loop_p (loop,
1037                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1038         {
1039           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1040             fprintf (vect_dump,
1041                      "not vectorized: inner-loop count not invariant.");
1042           destroy_loop_vec_info (inner_loop_vinfo, true);
1043           return NULL;
1044         }
1045
1046       if (loop->num_nodes != 5)
1047         {
1048           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1049             fprintf (vect_dump, "not vectorized: control flow in loop.");
1050           destroy_loop_vec_info (inner_loop_vinfo, true);
1051           return NULL;
1052         }
1053
1054       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1055       entryedge = EDGE_PRED (innerloop->header, 0);
1056       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1057         entryedge = EDGE_PRED (innerloop->header, 1);
1058
1059       if (entryedge->src != loop->header
1060           || !single_exit (innerloop)
1061           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1062         {
1063           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1064             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1065           destroy_loop_vec_info (inner_loop_vinfo, true);
1066           return NULL;
1067         }
1068
1069       if (vect_print_dump_info (REPORT_DETAILS))
1070         fprintf (vect_dump, "Considering outer-loop vectorization.");
1071     }
1072
1073   if (!single_exit (loop)
1074       || EDGE_COUNT (loop->header->preds) != 2)
1075     {
1076       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1077         {
1078           if (!single_exit (loop))
1079             fprintf (vect_dump, "not vectorized: multiple exits.");
1080           else if (EDGE_COUNT (loop->header->preds) != 2)
1081             fprintf (vect_dump, "not vectorized: too many incoming edges.");
1082         }
1083       if (inner_loop_vinfo)
1084         destroy_loop_vec_info (inner_loop_vinfo, true);
1085       return NULL;
1086     }
1087
1088   /* We assume that the loop exit condition is at the end of the loop. i.e,
1089      that the loop is represented as a do-while (with a proper if-guard
1090      before the loop if needed), where the loop header contains all the
1091      executable statements, and the latch is empty.  */
1092   if (!empty_block_p (loop->latch)
1093         || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1094     {
1095       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1096         fprintf (vect_dump, "not vectorized: unexpected loop form.");
1097       if (inner_loop_vinfo)
1098         destroy_loop_vec_info (inner_loop_vinfo, true);
1099       return NULL;
1100     }
1101
1102   /* Make sure there exists a single-predecessor exit bb:  */
1103   if (!single_pred_p (single_exit (loop)->dest))
1104     {
1105       edge e = single_exit (loop);
1106       if (!(e->flags & EDGE_ABNORMAL))
1107         {
1108           split_loop_exit_edge (e);
1109           if (vect_print_dump_info (REPORT_DETAILS))
1110             fprintf (vect_dump, "split exit edge.");
1111         }
1112       else
1113         {
1114           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1115             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1116           if (inner_loop_vinfo)
1117             destroy_loop_vec_info (inner_loop_vinfo, true);
1118           return NULL;
1119         }
1120     }
1121
1122   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1123   if (!loop_cond)
1124     {
1125       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1126         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1127       if (inner_loop_vinfo)
1128         destroy_loop_vec_info (inner_loop_vinfo, true);
1129       return NULL;
1130     }
1131
1132   if (!number_of_iterations)
1133     {
1134       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1135         fprintf (vect_dump,
1136                  "not vectorized: number of iterations cannot be computed.");
1137       if (inner_loop_vinfo)
1138         destroy_loop_vec_info (inner_loop_vinfo, true);
1139       return NULL;
1140     }
1141
1142   if (chrec_contains_undetermined (number_of_iterations))
1143     {
1144       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1145         fprintf (vect_dump, "Infinite number of iterations.");
1146       if (inner_loop_vinfo)
1147         destroy_loop_vec_info (inner_loop_vinfo, true);
1148       return NULL;
1149     }
1150
1151   if (!NITERS_KNOWN_P (number_of_iterations))
1152     {
1153       if (vect_print_dump_info (REPORT_DETAILS))
1154         {
1155           fprintf (vect_dump, "Symbolic number of iterations is ");
1156           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1157         }
1158     }
1159   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1160     {
1161       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1162         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1163       if (inner_loop_vinfo)
1164         destroy_loop_vec_info (inner_loop_vinfo, false);
1165       return NULL;
1166     }
1167
1168   loop_vinfo = new_loop_vec_info (loop);
1169   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1170   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1171
1172   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1173
1174   /* CHECKME: May want to keep it around it in the future.  */
1175   if (inner_loop_vinfo)
1176     destroy_loop_vec_info (inner_loop_vinfo, false);
1177
1178   gcc_assert (!loop->aux);
1179   loop->aux = loop_vinfo;
1180   return loop_vinfo;
1181 }
1182
1183
1184 /* Get cost by calling cost target builtin.  */
1185
1186 static inline int
1187 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1188 {
1189   tree dummy_type = NULL;
1190   int dummy = 0;
1191
1192   return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1193                                                        dummy_type, dummy);
1194 }
1195
1196
1197 /* Function vect_analyze_loop_operations.
1198
1199    Scan the loop stmts and make sure they are all vectorizable.  */
1200
1201 static bool
1202 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1203 {
1204   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1205   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1206   int nbbs = loop->num_nodes;
1207   gimple_stmt_iterator si;
1208   unsigned int vectorization_factor = 0;
1209   int i;
1210   gimple phi;
1211   stmt_vec_info stmt_info;
1212   bool need_to_vectorize = false;
1213   int min_profitable_iters;
1214   int min_scalar_loop_bound;
1215   unsigned int th;
1216   bool only_slp_in_loop = true, ok;
1217
1218   if (vect_print_dump_info (REPORT_DETAILS))
1219     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1220
1221   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1222   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1223   if (slp)
1224     {
1225       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1226          vectorization factor of the loop is the unrolling factor required by
1227          the SLP instances.  If that unrolling factor is 1, we say, that we
1228          perform pure SLP on loop - cross iteration parallelism is not
1229          exploited.  */
1230       for (i = 0; i < nbbs; i++)
1231         {
1232           basic_block bb = bbs[i];
1233           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1234             {
1235               gimple stmt = gsi_stmt (si);
1236               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1237               gcc_assert (stmt_info);
1238               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1239                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1240                   && !PURE_SLP_STMT (stmt_info))
1241                 /* STMT needs both SLP and loop-based vectorization.  */
1242                 only_slp_in_loop = false;
1243             }
1244         }
1245
1246       if (only_slp_in_loop)
1247         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1248       else
1249         vectorization_factor = least_common_multiple (vectorization_factor,
1250                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1251
1252       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1253       if (vect_print_dump_info (REPORT_DETAILS))
1254         fprintf (vect_dump, "Updating vectorization factor to %d ",
1255                             vectorization_factor);
1256     }
1257
1258   for (i = 0; i < nbbs; i++)
1259     {
1260       basic_block bb = bbs[i];
1261
1262       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1263         {
1264           phi = gsi_stmt (si);
1265           ok = true;
1266
1267           stmt_info = vinfo_for_stmt (phi);
1268           if (vect_print_dump_info (REPORT_DETAILS))
1269             {
1270               fprintf (vect_dump, "examining phi: ");
1271               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1272             }
1273
1274           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1275              (i.e., a phi in the tail of the outer-loop).  */
1276           if (! is_loop_header_bb_p (bb))
1277             {
1278               /* FORNOW: we currently don't support the case that these phis
1279                  are not used in the outerloop (unless it is double reduction,
1280                  i.e., this phi is vect_reduction_def), cause this case
1281                  requires to actually do something here.  */
1282               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1283                    || STMT_VINFO_LIVE_P (stmt_info))
1284                   && STMT_VINFO_DEF_TYPE (stmt_info)
1285                      != vect_double_reduction_def)
1286                 {
1287                   if (vect_print_dump_info (REPORT_DETAILS))
1288                     fprintf (vect_dump,
1289                              "Unsupported loop-closed phi in outer-loop.");
1290                   return false;
1291                 }
1292
1293               /* If PHI is used in the outer loop, we check that its operand
1294                  is defined in the inner loop.  */
1295               if (STMT_VINFO_RELEVANT_P (stmt_info))
1296                 {
1297                   tree phi_op;
1298                   gimple op_def_stmt;
1299
1300                   if (gimple_phi_num_args (phi) != 1)
1301                     return false;
1302
1303                   phi_op = PHI_ARG_DEF (phi, 0);
1304                   if (TREE_CODE (phi_op) != SSA_NAME)
1305                     return false;
1306
1307                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1308                   if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1309                     return false;
1310
1311                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1312                         != vect_used_in_outer
1313                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1314                            != vect_used_in_outer_by_reduction)
1315                     return false;
1316                 }
1317
1318               continue;
1319             }
1320
1321           gcc_assert (stmt_info);
1322
1323           if (STMT_VINFO_LIVE_P (stmt_info))
1324             {
1325               /* FORNOW: not yet supported.  */
1326               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1327                 fprintf (vect_dump, "not vectorized: value used after loop.");
1328               return false;
1329             }
1330
1331           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1332               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1333             {
1334               /* A scalar-dependence cycle that we don't support.  */
1335               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1336                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1337               return false;
1338             }
1339
1340           if (STMT_VINFO_RELEVANT_P (stmt_info))
1341             {
1342               need_to_vectorize = true;
1343               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1344                 ok = vectorizable_induction (phi, NULL, NULL);
1345             }
1346
1347           if (!ok)
1348             {
1349               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1350                 {
1351                   fprintf (vect_dump,
1352                            "not vectorized: relevant phi not supported: ");
1353                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1354                 }
1355               return false;
1356             }
1357         }
1358
1359       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1360         {
1361           gimple stmt = gsi_stmt (si);
1362           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1363             return false;
1364         }
1365     } /* bbs */
1366
1367   /* All operations in the loop are either irrelevant (deal with loop
1368      control, or dead), or only used outside the loop and can be moved
1369      out of the loop (e.g. invariants, inductions).  The loop can be
1370      optimized away by scalar optimizations.  We're better off not
1371      touching this loop.  */
1372   if (!need_to_vectorize)
1373     {
1374       if (vect_print_dump_info (REPORT_DETAILS))
1375         fprintf (vect_dump,
1376                  "All the computation can be taken out of the loop.");
1377       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1378         fprintf (vect_dump,
1379                  "not vectorized: redundant loop. no profit to vectorize.");
1380       return false;
1381     }
1382
1383   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1384       && vect_print_dump_info (REPORT_DETAILS))
1385     fprintf (vect_dump,
1386         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1387         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1388
1389   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1390       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1391     {
1392       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1393         fprintf (vect_dump, "not vectorized: iteration count too small.");
1394       if (vect_print_dump_info (REPORT_DETAILS))
1395         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1396                  "vectorization factor.");
1397       return false;
1398     }
1399
1400   /* Analyze cost.  Decide if worth while to vectorize.  */
1401
1402   /* Once VF is set, SLP costs should be updated since the number of created
1403      vector stmts depends on VF.  */
1404   vect_update_slp_costs_according_to_vf (loop_vinfo);
1405
1406   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1407   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1408
1409   if (min_profitable_iters < 0)
1410     {
1411       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1412         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1413       if (vect_print_dump_info (REPORT_DETAILS))
1414         fprintf (vect_dump, "not vectorized: vector version will never be "
1415                  "profitable.");
1416       return false;
1417     }
1418
1419   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1420                             * vectorization_factor) - 1);
1421
1422   /* Use the cost model only if it is more conservative than user specified
1423      threshold.  */
1424
1425   th = (unsigned) min_scalar_loop_bound;
1426   if (min_profitable_iters
1427       && (!min_scalar_loop_bound
1428           || min_profitable_iters > min_scalar_loop_bound))
1429     th = (unsigned) min_profitable_iters;
1430
1431   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1432       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1433     {
1434       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1435         fprintf (vect_dump, "not vectorized: vectorization not "
1436                  "profitable.");
1437       if (vect_print_dump_info (REPORT_DETAILS))
1438         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1439                  "user specified loop bound parameter or minimum "
1440                  "profitable iterations (whichever is more conservative).");
1441       return false;
1442     }
1443
1444   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1445       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1446       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1447     {
1448       if (vect_print_dump_info (REPORT_DETAILS))
1449         fprintf (vect_dump, "epilog loop required.");
1450       if (!vect_can_advance_ivs_p (loop_vinfo))
1451         {
1452           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1453             fprintf (vect_dump,
1454                      "not vectorized: can't create epilog loop 1.");
1455           return false;
1456         }
1457       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1458         {
1459           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1460             fprintf (vect_dump,
1461                      "not vectorized: can't create epilog loop 2.");
1462           return false;
1463         }
1464     }
1465
1466   return true;
1467 }
1468
1469
1470 /* Function vect_analyze_loop_2.
1471
1472    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1473    for it.  The different analyses will record information in the
1474    loop_vec_info struct.  */
1475 static bool
1476 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1477 {
1478   bool ok, slp = false;
1479   int max_vf = MAX_VECTORIZATION_FACTOR;
1480   int min_vf = 2;
1481
1482   /* Find all data references in the loop (which correspond to vdefs/vuses)
1483      and analyze their evolution in the loop.  Also adjust the minimal
1484      vectorization factor according to the loads and stores.
1485
1486      FORNOW: Handle only simple, array references, which
1487      alignment can be forced, and aligned pointer-references.  */
1488
1489   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1490   if (!ok)
1491     {
1492       if (vect_print_dump_info (REPORT_DETAILS))
1493         fprintf (vect_dump, "bad data references.");
1494       return false;
1495     }
1496
1497   /* Classify all cross-iteration scalar data-flow cycles.
1498      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1499
1500   vect_analyze_scalar_cycles (loop_vinfo);
1501
1502   vect_pattern_recog (loop_vinfo);
1503
1504   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1505
1506   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1507   if (!ok)
1508     {
1509       if (vect_print_dump_info (REPORT_DETAILS))
1510         fprintf (vect_dump, "unexpected pattern.");
1511       return false;
1512     }
1513
1514   /* Analyze data dependences between the data-refs in the loop
1515      and adjust the maximum vectorization factor according to
1516      the dependences.
1517      FORNOW: fail at the first data dependence that we encounter.  */
1518
1519   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1520   if (!ok
1521       || max_vf < min_vf)
1522     {
1523       if (vect_print_dump_info (REPORT_DETAILS))
1524         fprintf (vect_dump, "bad data dependence.");
1525       return false;
1526     }
1527
1528   ok = vect_determine_vectorization_factor (loop_vinfo);
1529   if (!ok)
1530     {
1531       if (vect_print_dump_info (REPORT_DETAILS))
1532         fprintf (vect_dump, "can't determine vectorization factor.");
1533       return false;
1534     }
1535   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1536     {
1537       if (vect_print_dump_info (REPORT_DETAILS))
1538         fprintf (vect_dump, "bad data dependence.");
1539       return false;
1540     }
1541
1542   /* Analyze the alignment of the data-refs in the loop.
1543      Fail if a data reference is found that cannot be vectorized.  */
1544
1545   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1546   if (!ok)
1547     {
1548       if (vect_print_dump_info (REPORT_DETAILS))
1549         fprintf (vect_dump, "bad data alignment.");
1550       return false;
1551     }
1552
1553   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1554      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1555
1556   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1557   if (!ok)
1558     {
1559       if (vect_print_dump_info (REPORT_DETAILS))
1560         fprintf (vect_dump, "bad data access.");
1561       return false;
1562     }
1563
1564   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1565      It is important to call pruning after vect_analyze_data_ref_accesses,
1566      since we use grouping information gathered by interleaving analysis.  */
1567   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1568   if (!ok)
1569     {
1570       if (vect_print_dump_info (REPORT_DETAILS))
1571         fprintf (vect_dump, "too long list of versioning for alias "
1572                             "run-time tests.");
1573       return false;
1574     }
1575
1576   /* This pass will decide on using loop versioning and/or loop peeling in
1577      order to enhance the alignment of data references in the loop.  */
1578
1579   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1580   if (!ok)
1581     {
1582       if (vect_print_dump_info (REPORT_DETAILS))
1583         fprintf (vect_dump, "bad data alignment.");
1584       return false;
1585     }
1586
1587   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1588   ok = vect_analyze_slp (loop_vinfo, NULL);
1589   if (ok)
1590     {
1591       /* Decide which possible SLP instances to SLP.  */
1592       slp = vect_make_slp_decision (loop_vinfo);
1593
1594       /* Find stmts that need to be both vectorized and SLPed.  */
1595       vect_detect_hybrid_slp (loop_vinfo);
1596     }
1597   else
1598     return false;
1599
1600   /* Scan all the operations in the loop and make sure they are
1601      vectorizable.  */
1602
1603   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1604   if (!ok)
1605     {
1606       if (vect_print_dump_info (REPORT_DETAILS))
1607         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1608       return false;
1609     }
1610
1611   return true;
1612 }
1613
1614 /* Function vect_analyze_loop.
1615
1616    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1617    for it.  The different analyses will record information in the
1618    loop_vec_info struct.  */
1619 loop_vec_info
1620 vect_analyze_loop (struct loop *loop)
1621 {
1622   loop_vec_info loop_vinfo;
1623   unsigned int vector_sizes;
1624
1625   /* Autodetect first vector size we try.  */
1626   current_vector_size = 0;
1627   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1628
1629   if (vect_print_dump_info (REPORT_DETAILS))
1630     fprintf (vect_dump, "===== analyze_loop_nest =====");
1631
1632   if (loop_outer (loop)
1633       && loop_vec_info_for_loop (loop_outer (loop))
1634       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1635     {
1636       if (vect_print_dump_info (REPORT_DETAILS))
1637         fprintf (vect_dump, "outer-loop already vectorized.");
1638       return NULL;
1639     }
1640
1641   while (1)
1642     {
1643       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1644       loop_vinfo = vect_analyze_loop_form (loop);
1645       if (!loop_vinfo)
1646         {
1647           if (vect_print_dump_info (REPORT_DETAILS))
1648             fprintf (vect_dump, "bad loop form.");
1649           return NULL;
1650         }
1651
1652       if (vect_analyze_loop_2 (loop_vinfo))
1653         {
1654           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1655
1656           return loop_vinfo;
1657         }
1658
1659       destroy_loop_vec_info (loop_vinfo, true);
1660
1661       vector_sizes &= ~current_vector_size;
1662       if (vector_sizes == 0
1663           || current_vector_size == 0)
1664         return NULL;
1665
1666       /* Try the next biggest vector size.  */
1667       current_vector_size = 1 << floor_log2 (vector_sizes);
1668       if (vect_print_dump_info (REPORT_DETAILS))
1669         fprintf (vect_dump, "***** Re-trying analysis with "
1670                  "vector size %d\n", current_vector_size);
1671     }
1672 }
1673
1674
1675 /* Function reduction_code_for_scalar_code
1676
1677    Input:
1678    CODE - tree_code of a reduction operations.
1679
1680    Output:
1681    REDUC_CODE - the corresponding tree-code to be used to reduce the
1682       vector of partial results into a single scalar result (which
1683       will also reside in a vector) or ERROR_MARK if the operation is
1684       a supported reduction operation, but does not have such tree-code.
1685
1686    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1687
1688 static bool
1689 reduction_code_for_scalar_code (enum tree_code code,
1690                                 enum tree_code *reduc_code)
1691 {
1692   switch (code)
1693     {
1694       case MAX_EXPR:
1695         *reduc_code = REDUC_MAX_EXPR;
1696         return true;
1697
1698       case MIN_EXPR:
1699         *reduc_code = REDUC_MIN_EXPR;
1700         return true;
1701
1702       case PLUS_EXPR:
1703         *reduc_code = REDUC_PLUS_EXPR;
1704         return true;
1705
1706       case MULT_EXPR:
1707       case MINUS_EXPR:
1708       case BIT_IOR_EXPR:
1709       case BIT_XOR_EXPR:
1710       case BIT_AND_EXPR:
1711         *reduc_code = ERROR_MARK;
1712         return true;
1713
1714       default:
1715        return false;
1716     }
1717 }
1718
1719
1720 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1721    STMT is printed with a message MSG. */
1722
1723 static void
1724 report_vect_op (gimple stmt, const char *msg)
1725 {
1726   fprintf (vect_dump, "%s", msg);
1727   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1728 }
1729
1730
1731 /* Detect SLP reduction of the form:
1732
1733    #a1 = phi <a5, a0>
1734    a2 = operation (a1)
1735    a3 = operation (a2)
1736    a4 = operation (a3)
1737    a5 = operation (a4)
1738
1739    #a = phi <a5>
1740
1741    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1742    FIRST_STMT is the first reduction stmt in the chain
1743    (a2 = operation (a1)).
1744
1745    Return TRUE if a reduction chain was detected.  */
1746
1747 static bool
1748 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1749 {
1750   struct loop *loop = (gimple_bb (phi))->loop_father;
1751   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1752   enum tree_code code;
1753   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1754   stmt_vec_info use_stmt_info, current_stmt_info;
1755   tree lhs;
1756   imm_use_iterator imm_iter;
1757   use_operand_p use_p;
1758   int nloop_uses, size = 0, n_out_of_loop_uses;
1759   bool found = false;
1760
1761   if (loop != vect_loop)
1762     return false;
1763
1764   lhs = PHI_RESULT (phi);
1765   code = gimple_assign_rhs_code (first_stmt);
1766   while (1)
1767     {
1768       nloop_uses = 0;
1769       n_out_of_loop_uses = 0;
1770       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1771         {
1772           gimple use_stmt = USE_STMT (use_p);
1773           if (is_gimple_debug (use_stmt))
1774             continue;
1775
1776           use_stmt = USE_STMT (use_p);
1777
1778           /* Check if we got back to the reduction phi.  */
1779           if (use_stmt == phi)
1780             {
1781               loop_use_stmt = use_stmt;
1782               found = true;
1783               break;
1784             }
1785
1786           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1787             {
1788               if (vinfo_for_stmt (use_stmt)
1789                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1790                 {
1791                   loop_use_stmt = use_stmt;
1792                   nloop_uses++;
1793                 }
1794             }
1795            else
1796              n_out_of_loop_uses++;
1797
1798            /* There are can be either a single use in the loop or two uses in
1799               phi nodes.  */
1800            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1801              return false;
1802         }
1803
1804       if (found)
1805         break;
1806
1807       /* We reached a statement with no loop uses.  */
1808       if (nloop_uses == 0)
1809         return false;
1810
1811       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1812       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1813         return false;
1814
1815       if (!is_gimple_assign (loop_use_stmt)
1816           || code != gimple_assign_rhs_code (loop_use_stmt)
1817           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1818         return false;
1819
1820       /* Insert USE_STMT into reduction chain.  */
1821       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1822       if (current_stmt)
1823         {
1824           current_stmt_info = vinfo_for_stmt (current_stmt);
1825           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1826           GROUP_FIRST_ELEMENT (use_stmt_info)
1827             = GROUP_FIRST_ELEMENT (current_stmt_info);
1828         }
1829       else
1830         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1831
1832       lhs = gimple_assign_lhs (loop_use_stmt);
1833       current_stmt = loop_use_stmt;
1834       size++;
1835    }
1836
1837   if (!found || loop_use_stmt != phi || size < 2)
1838     return false;
1839
1840   /* Swap the operands, if needed, to make the reduction operand be the second
1841      operand.  */
1842   lhs = PHI_RESULT (phi);
1843   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1844   while (next_stmt)
1845     {
1846       if (gimple_assign_rhs2 (next_stmt) == lhs)
1847         {
1848           tree op = gimple_assign_rhs1 (next_stmt);
1849           gimple def_stmt = NULL;
1850
1851           if (TREE_CODE (op) == SSA_NAME)
1852             def_stmt = SSA_NAME_DEF_STMT (op);
1853
1854           /* Check that the other def is either defined in the loop
1855              ("vect_internal_def"), or it's an induction (defined by a
1856              loop-header phi-node).  */
1857           if (def_stmt
1858               && gimple_bb (def_stmt)
1859               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1860               && (is_gimple_assign (def_stmt)
1861                   || is_gimple_call (def_stmt)
1862                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1863                            == vect_induction_def
1864                   || (gimple_code (def_stmt) == GIMPLE_PHI
1865                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1866                                   == vect_internal_def
1867                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1868             {
1869               lhs = gimple_assign_lhs (next_stmt);
1870               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1871               continue;
1872             }
1873
1874           return false;
1875         }
1876       else
1877         {
1878           tree op = gimple_assign_rhs2 (next_stmt);
1879           gimple def_stmt = NULL;
1880
1881           if (TREE_CODE (op) == SSA_NAME)
1882             def_stmt = SSA_NAME_DEF_STMT (op);
1883
1884           /* Check that the other def is either defined in the loop
1885             ("vect_internal_def"), or it's an induction (defined by a
1886             loop-header phi-node).  */
1887           if (def_stmt
1888               && gimple_bb (def_stmt)
1889               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1890               && (is_gimple_assign (def_stmt)
1891                   || is_gimple_call (def_stmt)
1892                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1893                               == vect_induction_def
1894                   || (gimple_code (def_stmt) == GIMPLE_PHI
1895                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1896                                   == vect_internal_def
1897                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1898             {
1899               if (vect_print_dump_info (REPORT_DETAILS))
1900                 {
1901                   fprintf (vect_dump, "swapping oprnds: ");
1902                   print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
1903                 }
1904
1905               swap_tree_operands (next_stmt,
1906                                   gimple_assign_rhs1_ptr (next_stmt),
1907                                   gimple_assign_rhs2_ptr (next_stmt));
1908               mark_symbols_for_renaming (next_stmt);
1909             }
1910           else
1911             return false;
1912         }
1913
1914       lhs = gimple_assign_lhs (next_stmt);
1915       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1916     }
1917
1918   /* Save the chain for further analysis in SLP detection.  */
1919   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1920   VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1921   GROUP_SIZE (vinfo_for_stmt (first)) = size;
1922
1923   return true;
1924 }
1925
1926
1927 /* Function vect_is_simple_reduction_1
1928
1929    (1) Detect a cross-iteration def-use cycle that represents a simple
1930    reduction computation.  We look for the following pattern:
1931
1932    loop_header:
1933      a1 = phi < a0, a2 >
1934      a3 = ...
1935      a2 = operation (a3, a1)
1936
1937    such that:
1938    1. operation is commutative and associative and it is safe to
1939       change the order of the computation (if CHECK_REDUCTION is true)
1940    2. no uses for a2 in the loop (a2 is used out of the loop)
1941    3. no uses of a1 in the loop besides the reduction operation
1942    4. no uses of a1 outside the loop.
1943
1944    Conditions 1,4 are tested here.
1945    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1946
1947    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1948    nested cycles, if CHECK_REDUCTION is false.
1949
1950    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1951    reductions:
1952
1953      a1 = phi < a0, a2 >
1954      inner loop (def of a3)
1955      a2 = phi < a3 >
1956
1957    If MODIFY is true it tries also to rework the code in-place to enable
1958    detection of more reduction patterns.  For the time being we rewrite
1959    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1960 */
1961
1962 static gimple
1963 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1964                             bool check_reduction, bool *double_reduc,
1965                             bool modify)
1966 {
1967   struct loop *loop = (gimple_bb (phi))->loop_father;
1968   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1969   edge latch_e = loop_latch_edge (loop);
1970   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1971   gimple def_stmt, def1 = NULL, def2 = NULL;
1972   enum tree_code orig_code, code;
1973   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1974   tree type;
1975   int nloop_uses;
1976   tree name;
1977   imm_use_iterator imm_iter;
1978   use_operand_p use_p;
1979   bool phi_def;
1980
1981   *double_reduc = false;
1982
1983   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1984      otherwise, we assume outer loop vectorization.  */
1985   gcc_assert ((check_reduction && loop == vect_loop)
1986               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1987
1988   name = PHI_RESULT (phi);
1989   nloop_uses = 0;
1990   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1991     {
1992       gimple use_stmt = USE_STMT (use_p);
1993       if (is_gimple_debug (use_stmt))
1994         continue;
1995
1996       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1997         {
1998           if (vect_print_dump_info (REPORT_DETAILS))
1999             fprintf (vect_dump, "intermediate value used outside loop.");
2000
2001           return NULL;
2002         }
2003
2004       if (vinfo_for_stmt (use_stmt)
2005           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2006         nloop_uses++;
2007       if (nloop_uses > 1)
2008         {
2009           if (vect_print_dump_info (REPORT_DETAILS))
2010             fprintf (vect_dump, "reduction used in loop.");
2011           return NULL;
2012         }
2013     }
2014
2015   if (TREE_CODE (loop_arg) != SSA_NAME)
2016     {
2017       if (vect_print_dump_info (REPORT_DETAILS))
2018         {
2019           fprintf (vect_dump, "reduction: not ssa_name: ");
2020           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2021         }
2022       return NULL;
2023     }
2024
2025   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2026   if (!def_stmt)
2027     {
2028       if (vect_print_dump_info (REPORT_DETAILS))
2029         fprintf (vect_dump, "reduction: no def_stmt.");
2030       return NULL;
2031     }
2032
2033   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2034     {
2035       if (vect_print_dump_info (REPORT_DETAILS))
2036         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2037       return NULL;
2038     }
2039
2040   if (is_gimple_assign (def_stmt))
2041     {
2042       name = gimple_assign_lhs (def_stmt);
2043       phi_def = false;
2044     }
2045   else
2046     {
2047       name = PHI_RESULT (def_stmt);
2048       phi_def = true;
2049     }
2050
2051   nloop_uses = 0;
2052   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2053     {
2054       gimple use_stmt = USE_STMT (use_p);
2055       if (is_gimple_debug (use_stmt))
2056         continue;
2057       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2058           && vinfo_for_stmt (use_stmt)
2059           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2060         nloop_uses++;
2061       if (nloop_uses > 1)
2062         {
2063           if (vect_print_dump_info (REPORT_DETAILS))
2064             fprintf (vect_dump, "reduction used in loop.");
2065           return NULL;
2066         }
2067     }
2068
2069   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2070      defined in the inner loop.  */
2071   if (phi_def)
2072     {
2073       op1 = PHI_ARG_DEF (def_stmt, 0);
2074
2075       if (gimple_phi_num_args (def_stmt) != 1
2076           || TREE_CODE (op1) != SSA_NAME)
2077         {
2078           if (vect_print_dump_info (REPORT_DETAILS))
2079             fprintf (vect_dump, "unsupported phi node definition.");
2080
2081           return NULL;
2082         }
2083
2084       def1 = SSA_NAME_DEF_STMT (op1);
2085       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2086           && loop->inner
2087           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2088           && is_gimple_assign (def1))
2089         {
2090           if (vect_print_dump_info (REPORT_DETAILS))
2091             report_vect_op (def_stmt, "detected double reduction: ");
2092
2093           *double_reduc = true;
2094           return def_stmt;
2095         }
2096
2097       return NULL;
2098     }
2099
2100   code = orig_code = gimple_assign_rhs_code (def_stmt);
2101
2102   /* We can handle "res -= x[i]", which is non-associative by
2103      simply rewriting this into "res += -x[i]".  Avoid changing
2104      gimple instruction for the first simple tests and only do this
2105      if we're allowed to change code at all.  */
2106   if (code == MINUS_EXPR
2107       && modify
2108       && (op1 = gimple_assign_rhs1 (def_stmt))
2109       && TREE_CODE (op1) == SSA_NAME
2110       && SSA_NAME_DEF_STMT (op1) == phi)
2111     code = PLUS_EXPR;
2112
2113   if (check_reduction
2114       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2115     {
2116       if (vect_print_dump_info (REPORT_DETAILS))
2117         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2118       return NULL;
2119     }
2120
2121   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2122     {
2123       if (code != COND_EXPR)
2124         {
2125           if (vect_print_dump_info (REPORT_DETAILS))
2126             report_vect_op (def_stmt, "reduction: not binary operation: ");
2127
2128           return NULL;
2129         }
2130
2131       op3 = gimple_assign_rhs1 (def_stmt);
2132       if (COMPARISON_CLASS_P (op3))
2133         {
2134           op4 = TREE_OPERAND (op3, 1);
2135           op3 = TREE_OPERAND (op3, 0);
2136         }
2137
2138       op1 = gimple_assign_rhs2 (def_stmt);
2139       op2 = gimple_assign_rhs3 (def_stmt);
2140
2141       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2142         {
2143           if (vect_print_dump_info (REPORT_DETAILS))
2144             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2145
2146           return NULL;
2147         }
2148     }
2149   else
2150     {
2151       op1 = gimple_assign_rhs1 (def_stmt);
2152       op2 = gimple_assign_rhs2 (def_stmt);
2153
2154       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2155         {
2156           if (vect_print_dump_info (REPORT_DETAILS))
2157             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2158
2159           return NULL;
2160         }
2161    }
2162
2163   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2164   if ((TREE_CODE (op1) == SSA_NAME
2165        && !types_compatible_p (type,TREE_TYPE (op1)))
2166       || (TREE_CODE (op2) == SSA_NAME
2167           && !types_compatible_p (type, TREE_TYPE (op2)))
2168       || (op3 && TREE_CODE (op3) == SSA_NAME
2169           && !types_compatible_p (type, TREE_TYPE (op3)))
2170       || (op4 && TREE_CODE (op4) == SSA_NAME
2171           && !types_compatible_p (type, TREE_TYPE (op4))))
2172     {
2173       if (vect_print_dump_info (REPORT_DETAILS))
2174         {
2175           fprintf (vect_dump, "reduction: multiple types: operation type: ");
2176           print_generic_expr (vect_dump, type, TDF_SLIM);
2177           fprintf (vect_dump, ", operands types: ");
2178           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2179           fprintf (vect_dump, ",");
2180           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2181           if (op3)
2182             {
2183               fprintf (vect_dump, ",");
2184               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2185             }
2186
2187           if (op4)
2188             {
2189               fprintf (vect_dump, ",");
2190               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2191             }
2192         }
2193
2194       return NULL;
2195     }
2196
2197   /* Check that it's ok to change the order of the computation.
2198      Generally, when vectorizing a reduction we change the order of the
2199      computation.  This may change the behavior of the program in some
2200      cases, so we need to check that this is ok.  One exception is when
2201      vectorizing an outer-loop: the inner-loop is executed sequentially,
2202      and therefore vectorizing reductions in the inner-loop during
2203      outer-loop vectorization is safe.  */
2204
2205   /* CHECKME: check for !flag_finite_math_only too?  */
2206   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2207       && check_reduction)
2208     {
2209       /* Changing the order of operations changes the semantics.  */
2210       if (vect_print_dump_info (REPORT_DETAILS))
2211         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2212       return NULL;
2213     }
2214   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2215            && check_reduction)
2216     {
2217       /* Changing the order of operations changes the semantics.  */
2218       if (vect_print_dump_info (REPORT_DETAILS))
2219         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2220       return NULL;
2221     }
2222   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2223     {
2224       /* Changing the order of operations changes the semantics.  */
2225       if (vect_print_dump_info (REPORT_DETAILS))
2226         report_vect_op (def_stmt,
2227                         "reduction: unsafe fixed-point math optimization: ");
2228       return NULL;
2229     }
2230
2231   /* If we detected "res -= x[i]" earlier, rewrite it into
2232      "res += -x[i]" now.  If this turns out to be useless reassoc
2233      will clean it up again.  */
2234   if (orig_code == MINUS_EXPR)
2235     {
2236       tree rhs = gimple_assign_rhs2 (def_stmt);
2237       tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2238       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2239                                                          rhs, NULL);
2240       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2241       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2242                                                           loop_info, NULL));
2243       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2244       gimple_assign_set_rhs2 (def_stmt, negrhs);
2245       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2246       update_stmt (def_stmt);
2247     }
2248
2249   /* Reduction is safe. We're dealing with one of the following:
2250      1) integer arithmetic and no trapv
2251      2) floating point arithmetic, and special flags permit this optimization
2252      3) nested cycle (i.e., outer loop vectorization).  */
2253   if (TREE_CODE (op1) == SSA_NAME)
2254     def1 = SSA_NAME_DEF_STMT (op1);
2255
2256   if (TREE_CODE (op2) == SSA_NAME)
2257     def2 = SSA_NAME_DEF_STMT (op2);
2258
2259   if (code != COND_EXPR
2260       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2261     {
2262       if (vect_print_dump_info (REPORT_DETAILS))
2263         report_vect_op (def_stmt, "reduction: no defs for operands: ");
2264       return NULL;
2265     }
2266
2267   /* Check that one def is the reduction def, defined by PHI,
2268      the other def is either defined in the loop ("vect_internal_def"),
2269      or it's an induction (defined by a loop-header phi-node).  */
2270
2271   if (def2 && def2 == phi
2272       && (code == COND_EXPR
2273           || !def1 || gimple_nop_p (def1)
2274           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2275               && (is_gimple_assign (def1)
2276                   || is_gimple_call (def1)
2277                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2278                       == vect_induction_def
2279                   || (gimple_code (def1) == GIMPLE_PHI
2280                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2281                           == vect_internal_def
2282                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2283     {
2284       if (vect_print_dump_info (REPORT_DETAILS))
2285         report_vect_op (def_stmt, "detected reduction: ");
2286       return def_stmt;
2287     }
2288
2289   if (def1 && def1 == phi
2290       && (code == COND_EXPR
2291           || !def2 || gimple_nop_p (def2)
2292           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2293               && (is_gimple_assign (def2)
2294                   || is_gimple_call (def2)
2295                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2296                       == vect_induction_def
2297                   || (gimple_code (def2) == GIMPLE_PHI
2298                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2299                           == vect_internal_def
2300                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2301     {
2302       if (check_reduction)
2303         {
2304           /* Swap operands (just for simplicity - so that the rest of the code
2305              can assume that the reduction variable is always the last (second)
2306              argument).  */
2307           if (vect_print_dump_info (REPORT_DETAILS))
2308             report_vect_op (def_stmt,
2309                             "detected reduction: need to swap operands: ");
2310
2311           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2312                               gimple_assign_rhs2_ptr (def_stmt));
2313         }
2314       else
2315         {
2316           if (vect_print_dump_info (REPORT_DETAILS))
2317             report_vect_op (def_stmt, "detected reduction: ");
2318         }
2319
2320       return def_stmt;
2321     }
2322
2323   /* Try to find SLP reduction chain.  */
2324   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2325     {
2326       if (vect_print_dump_info (REPORT_DETAILS))
2327         report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2328
2329       return def_stmt;
2330     }
2331
2332   if (vect_print_dump_info (REPORT_DETAILS))
2333     report_vect_op (def_stmt, "reduction: unknown pattern: ");
2334
2335   return NULL;
2336 }
2337
2338 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2339    in-place.  Arguments as there.  */
2340
2341 static gimple
2342 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2343                           bool check_reduction, bool *double_reduc)
2344 {
2345   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2346                                      double_reduc, false);
2347 }
2348
2349 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2350    in-place if it enables detection of more reductions.  Arguments
2351    as there.  */
2352
2353 gimple
2354 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2355                           bool check_reduction, bool *double_reduc)
2356 {
2357   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2358                                      double_reduc, true);
2359 }
2360
2361 /* Calculate the cost of one scalar iteration of the loop.  */
2362 int
2363 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2364 {
2365   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2366   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2367   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2368   int innerloop_iters, i, stmt_cost;
2369
2370   /* Count statements in scalar loop.  Using this as scalar cost for a single
2371      iteration for now.
2372
2373      TODO: Add outer loop support.
2374
2375      TODO: Consider assigning different costs to different scalar
2376      statements.  */
2377
2378   /* FORNOW.  */
2379   innerloop_iters = 1;
2380   if (loop->inner)
2381     innerloop_iters = 50; /* FIXME */
2382
2383   for (i = 0; i < nbbs; i++)
2384     {
2385       gimple_stmt_iterator si;
2386       basic_block bb = bbs[i];
2387
2388       if (bb->loop_father == loop->inner)
2389         factor = innerloop_iters;
2390       else
2391         factor = 1;
2392
2393       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2394         {
2395           gimple stmt = gsi_stmt (si);
2396           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2397
2398           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2399             continue;
2400
2401           /* Skip stmts that are not vectorized inside the loop.  */
2402           if (stmt_info
2403               && !STMT_VINFO_RELEVANT_P (stmt_info)
2404               && (!STMT_VINFO_LIVE_P (stmt_info)
2405                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2406             continue;
2407
2408           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2409             {
2410               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2411                stmt_cost = vect_get_cost (scalar_load);
2412              else
2413                stmt_cost = vect_get_cost (scalar_store);
2414             }
2415           else
2416             stmt_cost = vect_get_cost (scalar_stmt);
2417
2418           scalar_single_iter_cost += stmt_cost * factor;
2419         }
2420     }
2421   return scalar_single_iter_cost;
2422 }
2423
2424 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2425 int
2426 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2427                              int *peel_iters_epilogue,
2428                              int scalar_single_iter_cost)
2429 {
2430   int peel_guard_costs = 0;
2431   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2432
2433   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2434     {
2435       *peel_iters_epilogue = vf/2;
2436       if (vect_print_dump_info (REPORT_COST))
2437         fprintf (vect_dump, "cost model: "
2438                             "epilogue peel iters set to vf/2 because "
2439                             "loop iterations are unknown .");
2440
2441       /* If peeled iterations are known but number of scalar loop
2442          iterations are unknown, count a taken branch per peeled loop.  */
2443       peel_guard_costs =  2 * vect_get_cost (cond_branch_taken);
2444     }
2445   else
2446     {
2447       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2448       peel_iters_prologue = niters < peel_iters_prologue ?
2449                             niters : peel_iters_prologue;
2450       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2451       /* If we need to peel for gaps, but no peeling is required, we have to
2452          peel VF iterations.  */
2453       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2454         *peel_iters_epilogue = vf;
2455     }
2456
2457    return (peel_iters_prologue * scalar_single_iter_cost)
2458             + (*peel_iters_epilogue * scalar_single_iter_cost)
2459            + peel_guard_costs;
2460 }
2461
2462 /* Function vect_estimate_min_profitable_iters
2463
2464    Return the number of iterations required for the vector version of the
2465    loop to be profitable relative to the cost of the scalar version of the
2466    loop.
2467
2468    TODO: Take profile info into account before making vectorization
2469    decisions, if available.  */
2470
2471 int
2472 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2473 {
2474   int i;
2475   int min_profitable_iters;
2476   int peel_iters_prologue;
2477   int peel_iters_epilogue;
2478   int vec_inside_cost = 0;
2479   int vec_outside_cost = 0;
2480   int scalar_single_iter_cost = 0;
2481   int scalar_outside_cost = 0;
2482   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2483   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2484   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2485   int nbbs = loop->num_nodes;
2486   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2487   int peel_guard_costs = 0;
2488   int innerloop_iters = 0, factor;
2489   VEC (slp_instance, heap) *slp_instances;
2490   slp_instance instance;
2491
2492   /* Cost model disabled.  */
2493   if (!flag_vect_cost_model)
2494     {
2495       if (vect_print_dump_info (REPORT_COST))
2496         fprintf (vect_dump, "cost model disabled.");
2497       return 0;
2498     }
2499
2500   /* Requires loop versioning tests to handle misalignment.  */
2501   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2502     {
2503       /*  FIXME: Make cost depend on complexity of individual check.  */
2504       vec_outside_cost +=
2505         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2506       if (vect_print_dump_info (REPORT_COST))
2507         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2508                  "versioning to treat misalignment.\n");
2509     }
2510
2511   /* Requires loop versioning with alias checks.  */
2512   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2513     {
2514       /*  FIXME: Make cost depend on complexity of individual check.  */
2515       vec_outside_cost +=
2516         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2517       if (vect_print_dump_info (REPORT_COST))
2518         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2519                  "versioning aliasing.\n");
2520     }
2521
2522   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2523       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2524     vec_outside_cost += vect_get_cost (cond_branch_taken);
2525
2526   /* Count statements in scalar loop.  Using this as scalar cost for a single
2527      iteration for now.
2528
2529      TODO: Add outer loop support.
2530
2531      TODO: Consider assigning different costs to different scalar
2532      statements.  */
2533
2534   /* FORNOW.  */
2535   if (loop->inner)
2536     innerloop_iters = 50; /* FIXME */
2537
2538   for (i = 0; i < nbbs; i++)
2539     {
2540       gimple_stmt_iterator si;
2541       basic_block bb = bbs[i];
2542
2543       if (bb->loop_father == loop->inner)
2544         factor = innerloop_iters;
2545       else
2546         factor = 1;
2547
2548       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2549         {
2550           gimple stmt = gsi_stmt (si);
2551           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2552           /* Skip stmts that are not vectorized inside the loop.  */
2553           if (!STMT_VINFO_RELEVANT_P (stmt_info)
2554               && (!STMT_VINFO_LIVE_P (stmt_info)
2555                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2556             continue;
2557           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2558           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2559              some of the "outside" costs are generated inside the outer-loop.  */
2560           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2561         }
2562     }
2563
2564   scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2565
2566   /* Add additional cost for the peeled instructions in prologue and epilogue
2567      loop.
2568
2569      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2570      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2571
2572      TODO: Build an expression that represents peel_iters for prologue and
2573      epilogue to be used in a run-time test.  */
2574
2575   if (npeel  < 0)
2576     {
2577       peel_iters_prologue = vf/2;
2578       if (vect_print_dump_info (REPORT_COST))
2579         fprintf (vect_dump, "cost model: "
2580                  "prologue peel iters set to vf/2.");
2581
2582       /* If peeling for alignment is unknown, loop bound of main loop becomes
2583          unknown.  */
2584       peel_iters_epilogue = vf/2;
2585       if (vect_print_dump_info (REPORT_COST))
2586         fprintf (vect_dump, "cost model: "
2587                  "epilogue peel iters set to vf/2 because "
2588                  "peeling for alignment is unknown .");
2589
2590       /* If peeled iterations are unknown, count a taken branch and a not taken
2591          branch per peeled loop. Even if scalar loop iterations are known,
2592          vector iterations are not known since peeled prologue iterations are
2593          not known. Hence guards remain the same.  */
2594       peel_guard_costs +=  2 * (vect_get_cost (cond_branch_taken)
2595                                 + vect_get_cost (cond_branch_not_taken));
2596       vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2597                            + (peel_iters_epilogue * scalar_single_iter_cost)
2598                            + peel_guard_costs;
2599     }
2600   else
2601     {
2602       peel_iters_prologue = npeel;
2603       vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2604                                     peel_iters_prologue, &peel_iters_epilogue,
2605                                     scalar_single_iter_cost);
2606     }
2607
2608   /* FORNOW: The scalar outside cost is incremented in one of the
2609      following ways:
2610
2611      1. The vectorizer checks for alignment and aliasing and generates
2612      a condition that allows dynamic vectorization.  A cost model
2613      check is ANDED with the versioning condition.  Hence scalar code
2614      path now has the added cost of the versioning check.
2615
2616        if (cost > th & versioning_check)
2617          jmp to vector code
2618
2619      Hence run-time scalar is incremented by not-taken branch cost.
2620
2621      2. The vectorizer then checks if a prologue is required.  If the
2622      cost model check was not done before during versioning, it has to
2623      be done before the prologue check.
2624
2625        if (cost <= th)
2626          prologue = scalar_iters
2627        if (prologue == 0)
2628          jmp to vector code
2629        else
2630          execute prologue
2631        if (prologue == num_iters)
2632          go to exit
2633
2634      Hence the run-time scalar cost is incremented by a taken branch,
2635      plus a not-taken branch, plus a taken branch cost.
2636
2637      3. The vectorizer then checks if an epilogue is required.  If the
2638      cost model check was not done before during prologue check, it
2639      has to be done with the epilogue check.
2640
2641        if (prologue == 0)
2642          jmp to vector code
2643        else
2644          execute prologue
2645        if (prologue == num_iters)
2646          go to exit
2647        vector code:
2648          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2649            jmp to epilogue
2650
2651      Hence the run-time scalar cost should be incremented by 2 taken
2652      branches.
2653
2654      TODO: The back end may reorder the BBS's differently and reverse
2655      conditions/branch directions.  Change the estimates below to
2656      something more reasonable.  */
2657
2658   /* If the number of iterations is known and we do not do versioning, we can
2659      decide whether to vectorize at compile time.  Hence the scalar version
2660      do not carry cost model guard costs.  */
2661   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2662       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2663       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2664     {
2665       /* Cost model check occurs at versioning.  */
2666       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2667           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2668         scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2669       else
2670         {
2671           /* Cost model check occurs at prologue generation.  */
2672           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2673             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2674                                    + vect_get_cost (cond_branch_not_taken);
2675           /* Cost model check occurs at epilogue generation.  */
2676           else
2677             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2678         }
2679     }
2680
2681   /* Add SLP costs.  */
2682   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2683   FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2684     {
2685       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2686       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2687     }
2688
2689   /* Calculate number of iterations required to make the vector version
2690      profitable, relative to the loop bodies only.  The following condition
2691      must hold true:
2692      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2693      where
2694      SIC = scalar iteration cost, VIC = vector iteration cost,
2695      VOC = vector outside cost, VF = vectorization factor,
2696      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2697      SOC = scalar outside cost for run time cost model check.  */
2698
2699   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2700     {
2701       if (vec_outside_cost <= 0)
2702         min_profitable_iters = 1;
2703       else
2704         {
2705           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2706                                   - vec_inside_cost * peel_iters_prologue
2707                                   - vec_inside_cost * peel_iters_epilogue)
2708                                  / ((scalar_single_iter_cost * vf)
2709                                     - vec_inside_cost);
2710
2711           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2712               <= ((vec_inside_cost * min_profitable_iters)
2713                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2714             min_profitable_iters++;
2715         }
2716     }
2717   /* vector version will never be profitable.  */
2718   else
2719     {
2720       if (vect_print_dump_info (REPORT_COST))
2721         fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2722                  "divided by the scalar iteration cost = %d "
2723                  "is greater or equal to the vectorization factor = %d.",
2724                  vec_inside_cost, scalar_single_iter_cost, vf);
2725       return -1;
2726     }
2727
2728   if (vect_print_dump_info (REPORT_COST))
2729     {
2730       fprintf (vect_dump, "Cost model analysis: \n");
2731       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2732                vec_inside_cost);
2733       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2734                vec_outside_cost);
2735       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2736                scalar_single_iter_cost);
2737       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2738       fprintf (vect_dump, "  prologue iterations: %d\n",
2739                peel_iters_prologue);
2740       fprintf (vect_dump, "  epilogue iterations: %d\n",
2741                peel_iters_epilogue);
2742       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2743                min_profitable_iters);
2744     }
2745
2746   min_profitable_iters =
2747         min_profitable_iters < vf ? vf : min_profitable_iters;
2748
2749   /* Because the condition we create is:
2750      if (niters <= min_profitable_iters)
2751        then skip the vectorized loop.  */
2752   min_profitable_iters--;
2753
2754   if (vect_print_dump_info (REPORT_COST))
2755     fprintf (vect_dump, "  Profitability threshold = %d\n",
2756              min_profitable_iters);
2757
2758   return min_profitable_iters;
2759 }
2760
2761
2762 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2763    functions. Design better to avoid maintenance issues.  */
2764
2765 /* Function vect_model_reduction_cost.
2766
2767    Models cost for a reduction operation, including the vector ops
2768    generated within the strip-mine loop, the initial definition before
2769    the loop, and the epilogue code that must be generated.  */
2770
2771 static bool
2772 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2773                            int ncopies)
2774 {
2775   int outer_cost = 0;
2776   enum tree_code code;
2777   optab optab;
2778   tree vectype;
2779   gimple stmt, orig_stmt;
2780   tree reduction_op;
2781   enum machine_mode mode;
2782   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2783   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2784
2785
2786   /* Cost of reduction op inside loop.  */
2787   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2788     += ncopies * vect_get_cost (vector_stmt);
2789
2790   stmt = STMT_VINFO_STMT (stmt_info);
2791
2792   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2793     {
2794     case GIMPLE_SINGLE_RHS:
2795       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2796       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2797       break;
2798     case GIMPLE_UNARY_RHS:
2799       reduction_op = gimple_assign_rhs1 (stmt);
2800       break;
2801     case GIMPLE_BINARY_RHS:
2802       reduction_op = gimple_assign_rhs2 (stmt);
2803       break;
2804     case GIMPLE_TERNARY_RHS:
2805       reduction_op = gimple_assign_rhs3 (stmt);
2806       break;
2807     default:
2808       gcc_unreachable ();
2809     }
2810
2811   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2812   if (!vectype)
2813     {
2814       if (vect_print_dump_info (REPORT_COST))
2815         {
2816           fprintf (vect_dump, "unsupported data-type ");
2817           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2818         }
2819       return false;
2820    }
2821
2822   mode = TYPE_MODE (vectype);
2823   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2824
2825   if (!orig_stmt)
2826     orig_stmt = STMT_VINFO_STMT (stmt_info);
2827
2828   code = gimple_assign_rhs_code (orig_stmt);
2829
2830   /* Add in cost for initial definition.  */
2831   outer_cost += vect_get_cost (scalar_to_vec);
2832
2833   /* Determine cost of epilogue code.
2834
2835      We have a reduction operator that will reduce the vector in one statement.
2836      Also requires scalar extract.  */
2837
2838   if (!nested_in_vect_loop_p (loop, orig_stmt))
2839     {
2840       if (reduc_code != ERROR_MARK)
2841         outer_cost += vect_get_cost (vector_stmt)
2842                       + vect_get_cost (vec_to_scalar);
2843       else
2844         {
2845           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2846           tree bitsize =
2847             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2848           int element_bitsize = tree_low_cst (bitsize, 1);
2849           int nelements = vec_size_in_bits / element_bitsize;
2850
2851           optab = optab_for_tree_code (code, vectype, optab_default);
2852
2853           /* We have a whole vector shift available.  */
2854           if (VECTOR_MODE_P (mode)
2855               && optab_handler (optab, mode) != CODE_FOR_nothing
2856               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2857             /* Final reduction via vector shifts and the reduction operator. Also
2858                requires scalar extract.  */
2859             outer_cost += ((exact_log2(nelements) * 2)
2860               * vect_get_cost (vector_stmt)
2861               + vect_get_cost (vec_to_scalar));
2862           else
2863             /* Use extracts and reduction op for final reduction.  For N elements,
2864                we have N extracts and N-1 reduction ops.  */
2865             outer_cost += ((nelements + nelements - 1)
2866               * vect_get_cost (vector_stmt));
2867         }
2868     }
2869
2870   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2871
2872   if (vect_print_dump_info (REPORT_COST))
2873     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2874              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2875              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2876
2877   return true;
2878 }
2879
2880
2881 /* Function vect_model_induction_cost.
2882
2883    Models cost for induction operations.  */
2884
2885 static void
2886 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2887 {
2888   /* loop cost for vec_loop.  */
2889   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2890     = ncopies * vect_get_cost (vector_stmt);
2891   /* prologue cost for vec_init and vec_step.  */
2892   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2893     = 2 * vect_get_cost (scalar_to_vec);
2894
2895   if (vect_print_dump_info (REPORT_COST))
2896     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2897              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2898              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2899 }
2900
2901
2902 /* Function get_initial_def_for_induction
2903
2904    Input:
2905    STMT - a stmt that performs an induction operation in the loop.
2906    IV_PHI - the initial value of the induction variable
2907
2908    Output:
2909    Return a vector variable, initialized with the first VF values of
2910    the induction variable.  E.g., for an iv with IV_PHI='X' and
2911    evolution S, for a vector of 4 units, we want to return:
2912    [X, X + S, X + 2*S, X + 3*S].  */
2913
2914 static tree
2915 get_initial_def_for_induction (gimple iv_phi)
2916 {
2917   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2918   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2919   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2920   tree scalar_type;
2921   tree vectype;
2922   int nunits;
2923   edge pe = loop_preheader_edge (loop);
2924   struct loop *iv_loop;
2925   basic_block new_bb;
2926   tree vec, vec_init, vec_step, t;
2927   tree access_fn;
2928   tree new_var;
2929   tree new_name;
2930   gimple init_stmt, induction_phi, new_stmt;
2931   tree induc_def, vec_def, vec_dest;
2932   tree init_expr, step_expr;
2933   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2934   int i;
2935   bool ok;
2936   int ncopies;
2937   tree expr;
2938   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2939   bool nested_in_vect_loop = false;
2940   gimple_seq stmts = NULL;
2941   imm_use_iterator imm_iter;
2942   use_operand_p use_p;
2943   gimple exit_phi;
2944   edge latch_e;
2945   tree loop_arg;
2946   gimple_stmt_iterator si;
2947   basic_block bb = gimple_bb (iv_phi);
2948   tree stepvectype;
2949   tree resvectype;
2950
2951   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2952   if (nested_in_vect_loop_p (loop, iv_phi))
2953     {
2954       nested_in_vect_loop = true;
2955       iv_loop = loop->inner;
2956     }
2957   else
2958     iv_loop = loop;
2959   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2960
2961   latch_e = loop_latch_edge (iv_loop);
2962   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2963
2964   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2965   gcc_assert (access_fn);
2966   STRIP_NOPS (access_fn);
2967   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2968                                     &init_expr, &step_expr);
2969   gcc_assert (ok);
2970   pe = loop_preheader_edge (iv_loop);
2971
2972   scalar_type = TREE_TYPE (init_expr);
2973   vectype = get_vectype_for_scalar_type (scalar_type);
2974   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2975   gcc_assert (vectype);
2976   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2977   ncopies = vf / nunits;
2978
2979   gcc_assert (phi_info);
2980   gcc_assert (ncopies >= 1);
2981
2982   /* Find the first insertion point in the BB.  */
2983   si = gsi_after_labels (bb);
2984
2985   /* Create the vector that holds the initial_value of the induction.  */
2986   if (nested_in_vect_loop)
2987     {
2988       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2989          been created during vectorization of previous stmts.  We obtain it
2990          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
2991       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2992                                            loop_preheader_edge (iv_loop));
2993       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2994     }
2995   else
2996     {
2997       /* iv_loop is the loop to be vectorized. Create:
2998          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2999       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3000       add_referenced_var (new_var);
3001
3002       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3003       if (stmts)
3004         {
3005           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3006           gcc_assert (!new_bb);
3007         }
3008
3009       t = NULL_TREE;
3010       t = tree_cons (NULL_TREE, new_name, t);
3011       for (i = 1; i < nunits; i++)
3012         {
3013           /* Create: new_name_i = new_name + step_expr  */
3014           enum tree_code code = POINTER_TYPE_P (scalar_type)
3015                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3016           init_stmt = gimple_build_assign_with_ops (code, new_var,
3017                                                     new_name, step_expr);
3018           new_name = make_ssa_name (new_var, init_stmt);
3019           gimple_assign_set_lhs (init_stmt, new_name);
3020
3021           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3022           gcc_assert (!new_bb);
3023
3024           if (vect_print_dump_info (REPORT_DETAILS))
3025             {
3026               fprintf (vect_dump, "created new init_stmt: ");
3027               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
3028             }
3029           t = tree_cons (NULL_TREE, new_name, t);
3030         }
3031       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3032       vec = build_constructor_from_list (vectype, nreverse (t));
3033       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3034     }
3035
3036
3037   /* Create the vector that holds the step of the induction.  */
3038   if (nested_in_vect_loop)
3039     /* iv_loop is nested in the loop to be vectorized. Generate:
3040        vec_step = [S, S, S, S]  */
3041     new_name = step_expr;
3042   else
3043     {
3044       /* iv_loop is the loop to be vectorized. Generate:
3045           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3046       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3047       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3048                               expr, step_expr);
3049     }
3050
3051   t = unshare_expr (new_name);
3052   gcc_assert (CONSTANT_CLASS_P (new_name));
3053   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3054   gcc_assert (stepvectype);
3055   vec = build_vector_from_val (stepvectype, t);
3056   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3057
3058
3059   /* Create the following def-use cycle:
3060      loop prolog:
3061          vec_init = ...
3062          vec_step = ...
3063      loop:
3064          vec_iv = PHI <vec_init, vec_loop>
3065          ...
3066          STMT
3067          ...
3068          vec_loop = vec_iv + vec_step;  */
3069
3070   /* Create the induction-phi that defines the induction-operand.  */
3071   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3072   add_referenced_var (vec_dest);
3073   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3074   set_vinfo_for_stmt (induction_phi,
3075                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3076   induc_def = PHI_RESULT (induction_phi);
3077
3078   /* Create the iv update inside the loop  */
3079   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3080                                            induc_def, vec_step);
3081   vec_def = make_ssa_name (vec_dest, new_stmt);
3082   gimple_assign_set_lhs (new_stmt, vec_def);
3083   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3084   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3085                                                    NULL));
3086
3087   /* Set the arguments of the phi node:  */
3088   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3089   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3090                UNKNOWN_LOCATION);
3091
3092
3093   /* In case that vectorization factor (VF) is bigger than the number
3094      of elements that we can fit in a vectype (nunits), we have to generate
3095      more than one vector stmt - i.e - we need to "unroll" the
3096      vector stmt by a factor VF/nunits.  For more details see documentation
3097      in vectorizable_operation.  */
3098
3099   if (ncopies > 1)
3100     {
3101       stmt_vec_info prev_stmt_vinfo;
3102       /* FORNOW. This restriction should be relaxed.  */
3103       gcc_assert (!nested_in_vect_loop);
3104
3105       /* Create the vector that holds the step of the induction.  */
3106       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3107       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3108                               expr, step_expr);
3109       t = unshare_expr (new_name);
3110       gcc_assert (CONSTANT_CLASS_P (new_name));
3111       vec = build_vector_from_val (stepvectype, t);
3112       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3113
3114       vec_def = induc_def;
3115       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3116       for (i = 1; i < ncopies; i++)
3117         {
3118           /* vec_i = vec_prev + vec_step  */
3119           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3120                                                    vec_def, vec_step);
3121           vec_def = make_ssa_name (vec_dest, new_stmt);
3122           gimple_assign_set_lhs (new_stmt, vec_def);
3123
3124           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3125           if (!useless_type_conversion_p (resvectype, vectype))
3126             {
3127               new_stmt = gimple_build_assign_with_ops
3128                   (VIEW_CONVERT_EXPR,
3129                    vect_get_new_vect_var (resvectype, vect_simple_var,
3130                                           "vec_iv_"),
3131                    build1 (VIEW_CONVERT_EXPR, resvectype,
3132                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3133               gimple_assign_set_lhs (new_stmt,
3134                                      make_ssa_name
3135                                        (gimple_assign_lhs (new_stmt), new_stmt));
3136               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3137             }
3138           set_vinfo_for_stmt (new_stmt,
3139                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3140           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3141           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3142         }
3143     }
3144
3145   if (nested_in_vect_loop)
3146     {
3147       /* Find the loop-closed exit-phi of the induction, and record
3148          the final vector of induction results:  */
3149       exit_phi = NULL;
3150       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3151         {
3152           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3153             {
3154               exit_phi = USE_STMT (use_p);
3155               break;
3156             }
3157         }
3158       if (exit_phi)
3159         {
3160           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3161           /* FORNOW. Currently not supporting the case that an inner-loop induction
3162              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3163           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3164                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3165
3166           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3167           if (vect_print_dump_info (REPORT_DETAILS))
3168             {
3169               fprintf (vect_dump, "vector of inductions after inner-loop:");
3170               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3171             }
3172         }
3173     }
3174
3175
3176   if (vect_print_dump_info (REPORT_DETAILS))
3177     {
3178       fprintf (vect_dump, "transform induction: created def-use cycle: ");
3179       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3180       fprintf (vect_dump, "\n");
3181       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3182     }
3183
3184   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3185   if (!useless_type_conversion_p (resvectype, vectype))
3186     {
3187       new_stmt = gimple_build_assign_with_ops
3188          (VIEW_CONVERT_EXPR,
3189           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3190           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3191       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3192       gimple_assign_set_lhs (new_stmt, induc_def);
3193       si = gsi_start_bb (bb);
3194       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3195       set_vinfo_for_stmt (new_stmt,
3196                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3197       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3198         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3199     }
3200
3201   return induc_def;
3202 }
3203
3204
3205 /* Function get_initial_def_for_reduction
3206
3207    Input:
3208    STMT - a stmt that performs a reduction operation in the loop.
3209    INIT_VAL - the initial value of the reduction variable
3210
3211    Output:
3212    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3213         of the reduction (used for adjusting the epilog - see below).
3214    Return a vector variable, initialized according to the operation that STMT
3215         performs. This vector will be used as the initial value of the
3216         vector of partial results.
3217
3218    Option1 (adjust in epilog): Initialize the vector as follows:
3219      add/bit or/xor:    [0,0,...,0,0]
3220      mult/bit and:      [1,1,...,1,1]
3221      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3222    and when necessary (e.g. add/mult case) let the caller know
3223    that it needs to adjust the result by init_val.
3224
3225    Option2: Initialize the vector as follows:
3226      add/bit or/xor:    [init_val,0,0,...,0]
3227      mult/bit and:      [init_val,1,1,...,1]
3228      min/max/cond_expr: [init_val,init_val,...,init_val]
3229    and no adjustments are needed.
3230
3231    For example, for the following code:
3232
3233    s = init_val;
3234    for (i=0;i<n;i++)
3235      s = s + a[i];
3236
3237    STMT is 's = s + a[i]', and the reduction variable is 's'.
3238    For a vector of 4 units, we want to return either [0,0,0,init_val],
3239    or [0,0,0,0] and let the caller know that it needs to adjust
3240    the result at the end by 'init_val'.
3241
3242    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3243    initialization vector is simpler (same element in all entries), if
3244    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3245
3246    A cost model should help decide between these two schemes.  */
3247
3248 tree
3249 get_initial_def_for_reduction (gimple stmt, tree init_val,
3250                                tree *adjustment_def)
3251 {
3252   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3253   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3254   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3255   tree scalar_type = TREE_TYPE (init_val);
3256   tree vectype = get_vectype_for_scalar_type (scalar_type);
3257   int nunits;
3258   enum tree_code code = gimple_assign_rhs_code (stmt);
3259   tree def_for_init;
3260   tree init_def;
3261   tree t = NULL_TREE;
3262   int i;
3263   bool nested_in_vect_loop = false;
3264   tree init_value;
3265   REAL_VALUE_TYPE real_init_val = dconst0;
3266   int int_init_val = 0;
3267   gimple def_stmt = NULL;
3268
3269   gcc_assert (vectype);
3270   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3271
3272   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3273               || SCALAR_FLOAT_TYPE_P (scalar_type));
3274
3275   if (nested_in_vect_loop_p (loop, stmt))
3276     nested_in_vect_loop = true;
3277   else
3278     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3279
3280   /* In case of double reduction we only create a vector variable to be put
3281      in the reduction phi node.  The actual statement creation is done in
3282      vect_create_epilog_for_reduction.  */
3283   if (adjustment_def && nested_in_vect_loop
3284       && TREE_CODE (init_val) == SSA_NAME
3285       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3286       && gimple_code (def_stmt) == GIMPLE_PHI
3287       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3288       && vinfo_for_stmt (def_stmt)
3289       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3290           == vect_double_reduction_def)
3291     {
3292       *adjustment_def = NULL;
3293       return vect_create_destination_var (init_val, vectype);
3294     }
3295
3296   if (TREE_CONSTANT (init_val))
3297     {
3298       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3299         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3300       else
3301         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3302     }
3303   else
3304     init_value = init_val;
3305
3306   switch (code)
3307     {
3308       case WIDEN_SUM_EXPR:
3309       case DOT_PROD_EXPR:
3310       case PLUS_EXPR:
3311       case MINUS_EXPR:
3312       case BIT_IOR_EXPR:
3313       case BIT_XOR_EXPR:
3314       case MULT_EXPR:
3315       case BIT_AND_EXPR:
3316         /* ADJUSMENT_DEF is NULL when called from
3317            vect_create_epilog_for_reduction to vectorize double reduction.  */
3318         if (adjustment_def)
3319           {
3320             if (nested_in_vect_loop)
3321               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3322                                                               NULL);
3323             else
3324               *adjustment_def = init_val;
3325           }
3326
3327         if (code == MULT_EXPR)
3328           {
3329             real_init_val = dconst1;
3330             int_init_val = 1;
3331           }
3332
3333         if (code == BIT_AND_EXPR)
3334           int_init_val = -1;
3335
3336         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3337           def_for_init = build_real (scalar_type, real_init_val);
3338         else
3339           def_for_init = build_int_cst (scalar_type, int_init_val);
3340
3341         /* Create a vector of '0' or '1' except the first element.  */
3342         for (i = nunits - 2; i >= 0; --i)
3343           t = tree_cons (NULL_TREE, def_for_init, t);
3344
3345         /* Option1: the first element is '0' or '1' as well.  */
3346         if (adjustment_def)
3347           {
3348             t = tree_cons (NULL_TREE, def_for_init, t);
3349             init_def = build_vector (vectype, t);
3350             break;
3351           }
3352
3353         /* Option2: the first element is INIT_VAL.  */
3354         t = tree_cons (NULL_TREE, init_value, t);
3355         if (TREE_CONSTANT (init_val))
3356           init_def = build_vector (vectype, t);
3357         else
3358           init_def = build_constructor_from_list (vectype, t);
3359
3360         break;
3361
3362       case MIN_EXPR:
3363       case MAX_EXPR:
3364       case COND_EXPR:
3365         if (adjustment_def)
3366           {
3367             *adjustment_def = NULL_TREE;
3368             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3369             break;
3370           }
3371
3372         init_def = build_vector_from_val (vectype, init_value);
3373         break;
3374
3375       default:
3376         gcc_unreachable ();
3377     }
3378
3379   return init_def;
3380 }
3381
3382
3383 /* Function vect_create_epilog_for_reduction
3384
3385    Create code at the loop-epilog to finalize the result of a reduction
3386    computation.
3387
3388    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3389      reduction statements.
3390    STMT is the scalar reduction stmt that is being vectorized.
3391    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3392      number of elements that we can fit in a vectype (nunits).  In this case
3393      we have to generate more than one vector stmt - i.e - we need to "unroll"
3394      the vector stmt by a factor VF/nunits.  For more details see documentation
3395      in vectorizable_operation.
3396    REDUC_CODE is the tree-code for the epilog reduction.
3397    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3398      computation.
3399    REDUC_INDEX is the index of the operand in the right hand side of the
3400      statement that is defined by REDUCTION_PHI.
3401    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3402    SLP_NODE is an SLP node containing a group of reduction statements. The
3403      first one in this group is STMT.
3404
3405    This function:
3406    1. Creates the reduction def-use cycles: sets the arguments for
3407       REDUCTION_PHIS:
3408       The loop-entry argument is the vectorized initial-value of the reduction.
3409       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3410       sums.
3411    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3412       by applying the operation specified by REDUC_CODE if available, or by
3413       other means (whole-vector shifts or a scalar loop).
3414       The function also creates a new phi node at the loop exit to preserve
3415       loop-closed form, as illustrated below.
3416
3417      The flow at the entry to this function:
3418
3419         loop:
3420           vec_def = phi <null, null>            # REDUCTION_PHI
3421           VECT_DEF = vector_stmt                # vectorized form of STMT
3422           s_loop = scalar_stmt                  # (scalar) STMT
3423         loop_exit:
3424           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3425           use <s_out0>
3426           use <s_out0>
3427
3428      The above is transformed by this function into:
3429
3430         loop:
3431           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3432           VECT_DEF = vector_stmt                # vectorized form of STMT
3433           s_loop = scalar_stmt                  # (scalar) STMT
3434         loop_exit:
3435           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3436           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3437           v_out2 = reduce <v_out1>
3438           s_out3 = extract_field <v_out2, 0>
3439           s_out4 = adjust_result <s_out3>
3440           use <s_out4>
3441           use <s_out4>
3442 */
3443
3444 static void
3445 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3446                                   int ncopies, enum tree_code reduc_code,
3447                                   VEC (gimple, heap) *reduction_phis,
3448                                   int reduc_index, bool double_reduc,
3449                                   slp_tree slp_node)
3450 {
3451   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3452   stmt_vec_info prev_phi_info;
3453   tree vectype;
3454   enum machine_mode mode;
3455   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3456   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3457   basic_block exit_bb;
3458   tree scalar_dest;
3459   tree scalar_type;
3460   gimple new_phi = NULL, phi;
3461   gimple_stmt_iterator exit_gsi;
3462   tree vec_dest;
3463   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3464   gimple epilog_stmt = NULL;
3465   enum tree_code code = gimple_assign_rhs_code (stmt);
3466   gimple exit_phi;
3467   tree bitsize, bitpos;
3468   tree adjustment_def = NULL;
3469   tree vec_initial_def = NULL;
3470   tree reduction_op, expr, def;
3471   tree orig_name, scalar_result;
3472   imm_use_iterator imm_iter, phi_imm_iter;
3473   use_operand_p use_p, phi_use_p;
3474   bool extract_scalar_result = false;
3475   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3476   bool nested_in_vect_loop = false;
3477   VEC (gimple, heap) *new_phis = NULL;
3478   enum vect_def_type dt = vect_unknown_def_type;
3479   int j, i;
3480   VEC (tree, heap) *scalar_results = NULL;
3481   unsigned int group_size = 1, k, ratio;
3482   VEC (tree, heap) *vec_initial_defs = NULL;
3483   VEC (gimple, heap) *phis;
3484   bool slp_reduc = false;
3485   tree new_phi_result;
3486
3487   if (slp_node)
3488     group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3489
3490   if (nested_in_vect_loop_p (loop, stmt))
3491     {
3492       outer_loop = loop;
3493       loop = loop->inner;
3494       nested_in_vect_loop = true;
3495       gcc_assert (!slp_node);
3496     }
3497
3498   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3499     {
3500     case GIMPLE_SINGLE_RHS:
3501       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3502                   == ternary_op);
3503       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3504       break;
3505     case GIMPLE_UNARY_RHS:
3506       reduction_op = gimple_assign_rhs1 (stmt);
3507       break;
3508     case GIMPLE_BINARY_RHS:
3509       reduction_op = reduc_index ?
3510                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3511       break;
3512     case GIMPLE_TERNARY_RHS:
3513       reduction_op = gimple_op (stmt, reduc_index + 1);
3514       break;
3515     default:
3516       gcc_unreachable ();
3517     }
3518
3519   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3520   gcc_assert (vectype);
3521   mode = TYPE_MODE (vectype);
3522
3523   /* 1. Create the reduction def-use cycle:
3524      Set the arguments of REDUCTION_PHIS, i.e., transform
3525
3526         loop:
3527           vec_def = phi <null, null>            # REDUCTION_PHI
3528           VECT_DEF = vector_stmt                # vectorized form of STMT
3529           ...
3530
3531      into:
3532
3533         loop:
3534           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3535           VECT_DEF = vector_stmt                # vectorized form of STMT
3536           ...
3537
3538      (in case of SLP, do it for all the phis). */
3539
3540   /* Get the loop-entry arguments.  */
3541   if (slp_node)
3542     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3543                        NULL, slp_node, reduc_index);
3544   else
3545     {
3546       vec_initial_defs = VEC_alloc (tree, heap, 1);
3547      /* For the case of reduction, vect_get_vec_def_for_operand returns
3548         the scalar def before the loop, that defines the initial value
3549         of the reduction variable.  */
3550       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3551                                                       &adjustment_def);
3552       VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3553     }
3554
3555   /* Set phi nodes arguments.  */
3556   FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3557     {
3558       tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3559       tree def = VEC_index (tree, vect_defs, i);
3560       for (j = 0; j < ncopies; j++)
3561         {
3562           /* Set the loop-entry arg of the reduction-phi.  */
3563           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3564                        UNKNOWN_LOCATION);
3565
3566           /* Set the loop-latch arg for the reduction-phi.  */
3567           if (j > 0)
3568             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3569
3570           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3571
3572           if (vect_print_dump_info (REPORT_DETAILS))
3573             {
3574               fprintf (vect_dump, "transform reduction: created def-use"
3575                                   " cycle: ");
3576               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3577               fprintf (vect_dump, "\n");
3578               print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3579                                  TDF_SLIM);
3580             }
3581
3582           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3583         }
3584     }
3585
3586   VEC_free (tree, heap, vec_initial_defs);
3587
3588   /* 2. Create epilog code.
3589         The reduction epilog code operates across the elements of the vector
3590         of partial results computed by the vectorized loop.
3591         The reduction epilog code consists of:
3592
3593         step 1: compute the scalar result in a vector (v_out2)
3594         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3595         step 3: adjust the scalar result (s_out3) if needed.
3596
3597         Step 1 can be accomplished using one the following three schemes:
3598           (scheme 1) using reduc_code, if available.
3599           (scheme 2) using whole-vector shifts, if available.
3600           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3601                      combined.
3602
3603           The overall epilog code looks like this:
3604
3605           s_out0 = phi <s_loop>         # original EXIT_PHI
3606           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3607           v_out2 = reduce <v_out1>              # step 1
3608           s_out3 = extract_field <v_out2, 0>    # step 2
3609           s_out4 = adjust_result <s_out3>       # step 3
3610
3611           (step 3 is optional, and steps 1 and 2 may be combined).
3612           Lastly, the uses of s_out0 are replaced by s_out4.  */
3613
3614
3615   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3616          v_out1 = phi <VECT_DEF>
3617          Store them in NEW_PHIS.  */
3618
3619   exit_bb = single_exit (loop)->dest;
3620   prev_phi_info = NULL;
3621   new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3622   FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3623     {
3624       for (j = 0; j < ncopies; j++)
3625         {
3626           phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3627           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3628           if (j == 0)
3629             VEC_quick_push (gimple, new_phis, phi);
3630           else
3631             {
3632               def = vect_get_vec_def_for_stmt_copy (dt, def);
3633               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3634             }
3635
3636           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3637           prev_phi_info = vinfo_for_stmt (phi);
3638         }
3639     }
3640
3641   /* The epilogue is created for the outer-loop, i.e., for the loop being
3642      vectorized.  */
3643   if (double_reduc)
3644     {
3645       loop = outer_loop;
3646       exit_bb = single_exit (loop)->dest;
3647     }
3648
3649   exit_gsi = gsi_after_labels (exit_bb);
3650
3651   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3652          (i.e. when reduc_code is not available) and in the final adjustment
3653          code (if needed).  Also get the original scalar reduction variable as
3654          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3655          represents a reduction pattern), the tree-code and scalar-def are
3656          taken from the original stmt that the pattern-stmt (STMT) replaces.
3657          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3658          are taken from STMT.  */
3659
3660   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3661   if (!orig_stmt)
3662     {
3663       /* Regular reduction  */
3664       orig_stmt = stmt;
3665     }
3666   else
3667     {
3668       /* Reduction pattern  */
3669       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3670       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3671       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3672     }
3673
3674   code = gimple_assign_rhs_code (orig_stmt);
3675   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3676      partial results are added and not subtracted.  */
3677   if (code == MINUS_EXPR)
3678     code = PLUS_EXPR;
3679
3680   scalar_dest = gimple_assign_lhs (orig_stmt);
3681   scalar_type = TREE_TYPE (scalar_dest);
3682   scalar_results = VEC_alloc (tree, heap, group_size);
3683   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3684   bitsize = TYPE_SIZE (scalar_type);
3685
3686   /* In case this is a reduction in an inner-loop while vectorizing an outer
3687      loop - we don't need to extract a single scalar result at the end of the
3688      inner-loop (unless it is double reduction, i.e., the use of reduction is
3689      outside the outer-loop).  The final vector of partial results will be used
3690      in the vectorized outer-loop, or reduced to a scalar result at the end of
3691      the outer-loop.  */
3692   if (nested_in_vect_loop && !double_reduc)
3693     goto vect_finalize_reduction;
3694
3695   /* SLP reduction without reduction chain, e.g.,
3696      # a1 = phi <a2, a0>
3697      # b1 = phi <b2, b0>
3698      a2 = operation (a1)
3699      b2 = operation (b1)  */
3700   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3701
3702   /* In case of reduction chain, e.g.,
3703      # a1 = phi <a3, a0>
3704      a2 = operation (a1)
3705      a3 = operation (a2),
3706
3707      we may end up with more than one vector result.  Here we reduce them to
3708      one vector.  */
3709   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3710     {
3711       tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3712       tree tmp;
3713       gimple new_vec_stmt = NULL;
3714
3715       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3716       for (k = 1; k < VEC_length (gimple, new_phis); k++)
3717         {
3718           gimple next_phi = VEC_index (gimple, new_phis, k);
3719           tree second_vect = PHI_RESULT (next_phi);
3720
3721           tmp = build2 (code, vectype,  first_vect, second_vect);
3722           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3723           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3724           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3725           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3726         }
3727
3728       new_phi_result = first_vect;
3729       if (new_vec_stmt)
3730         {
3731           VEC_truncate (gimple, new_phis, 0);
3732           VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3733         }
3734     }
3735   else
3736     new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3737
3738   /* 2.3 Create the reduction code, using one of the three schemes described
3739          above. In SLP we simply need to extract all the elements from the
3740          vector (without reducing them), so we use scalar shifts.  */
3741   if (reduc_code != ERROR_MARK && !slp_reduc)
3742     {
3743       tree tmp;
3744
3745       /*** Case 1:  Create:
3746            v_out2 = reduc_expr <v_out1>  */
3747
3748       if (vect_print_dump_info (REPORT_DETAILS))
3749         fprintf (vect_dump, "Reduce using direct vector reduction.");
3750
3751       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3752       tmp = build1 (reduc_code, vectype, new_phi_result);
3753       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3754       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3755       gimple_assign_set_lhs (epilog_stmt, new_temp);
3756       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3757
3758       extract_scalar_result = true;
3759     }
3760   else
3761     {
3762       enum tree_code shift_code = ERROR_MARK;
3763       bool have_whole_vector_shift = true;
3764       int bit_offset;
3765       int element_bitsize = tree_low_cst (bitsize, 1);
3766       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3767       tree vec_temp;
3768
3769       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3770         shift_code = VEC_RSHIFT_EXPR;
3771       else
3772         have_whole_vector_shift = false;
3773
3774       /* Regardless of whether we have a whole vector shift, if we're
3775          emulating the operation via tree-vect-generic, we don't want
3776          to use it.  Only the first round of the reduction is likely
3777          to still be profitable via emulation.  */
3778       /* ??? It might be better to emit a reduction tree code here, so that
3779          tree-vect-generic can expand the first round via bit tricks.  */
3780       if (!VECTOR_MODE_P (mode))
3781         have_whole_vector_shift = false;
3782       else
3783         {
3784           optab optab = optab_for_tree_code (code, vectype, optab_default);
3785           if (optab_handler (optab, mode) == CODE_FOR_nothing)
3786             have_whole_vector_shift = false;
3787         }
3788
3789       if (have_whole_vector_shift && !slp_reduc)
3790         {
3791           /*** Case 2: Create:
3792              for (offset = VS/2; offset >= element_size; offset/=2)
3793                 {
3794                   Create:  va' = vec_shift <va, offset>
3795                   Create:  va = vop <va, va'>
3796                 }  */
3797
3798           if (vect_print_dump_info (REPORT_DETAILS))
3799             fprintf (vect_dump, "Reduce using vector shifts");
3800
3801           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3802           new_temp = new_phi_result;
3803           for (bit_offset = vec_size_in_bits/2;
3804                bit_offset >= element_bitsize;
3805                bit_offset /= 2)
3806             {
3807               tree bitpos = size_int (bit_offset);
3808
3809               epilog_stmt = gimple_build_assign_with_ops (shift_code,
3810                                                vec_dest, new_temp, bitpos);
3811               new_name = make_ssa_name (vec_dest, epilog_stmt);
3812               gimple_assign_set_lhs (epilog_stmt, new_name);
3813               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3814
3815               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3816                                                           new_name, new_temp);
3817               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3818               gimple_assign_set_lhs (epilog_stmt, new_temp);
3819               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3820             }
3821
3822           extract_scalar_result = true;
3823         }
3824       else
3825         {
3826           tree rhs;
3827
3828           /*** Case 3: Create:
3829              s = extract_field <v_out2, 0>
3830              for (offset = element_size;
3831                   offset < vector_size;
3832                   offset += element_size;)
3833                {
3834                  Create:  s' = extract_field <v_out2, offset>
3835                  Create:  s = op <s, s'>  // For non SLP cases
3836                }  */
3837
3838           if (vect_print_dump_info (REPORT_DETAILS))
3839             fprintf (vect_dump, "Reduce using scalar code. ");
3840
3841           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3842           FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3843             {
3844               if (gimple_code (new_phi) == GIMPLE_PHI)
3845                 vec_temp = PHI_RESULT (new_phi);
3846               else
3847                 vec_temp = gimple_assign_lhs (new_phi);
3848               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3849                             bitsize_zero_node);
3850               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3851               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3852               gimple_assign_set_lhs (epilog_stmt, new_temp);
3853               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3854
3855               /* In SLP we don't need to apply reduction operation, so we just
3856                  collect s' values in SCALAR_RESULTS.  */
3857               if (slp_reduc)
3858                 VEC_safe_push (tree, heap, scalar_results, new_temp);
3859
3860               for (bit_offset = element_bitsize;
3861                    bit_offset < vec_size_in_bits;
3862                    bit_offset += element_bitsize)
3863                 {
3864                   tree bitpos = bitsize_int (bit_offset);
3865                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3866                                      bitsize, bitpos);
3867
3868                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3869                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3870                   gimple_assign_set_lhs (epilog_stmt, new_name);
3871                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3872
3873                   if (slp_reduc)
3874                     {
3875                       /* In SLP we don't need to apply reduction operation, so
3876                          we just collect s' values in SCALAR_RESULTS.  */
3877                       new_temp = new_name;
3878                       VEC_safe_push (tree, heap, scalar_results, new_name);
3879                     }
3880                   else
3881                     {
3882                       epilog_stmt = gimple_build_assign_with_ops (code,
3883                                           new_scalar_dest, new_name, new_temp);
3884                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3885                       gimple_assign_set_lhs (epilog_stmt, new_temp);
3886                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3887                     }
3888                 }
3889             }
3890
3891           /* The only case where we need to reduce scalar results in SLP, is
3892              unrolling.  If the size of SCALAR_RESULTS is greater than
3893              GROUP_SIZE, we reduce them combining elements modulo
3894              GROUP_SIZE.  */
3895           if (slp_reduc)
3896             {
3897               tree res, first_res, new_res;
3898               gimple new_stmt;
3899
3900               /* Reduce multiple scalar results in case of SLP unrolling.  */
3901               for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3902                    j++)
3903                 {
3904                   first_res = VEC_index (tree, scalar_results, j % group_size);
3905                   new_stmt = gimple_build_assign_with_ops (code,
3906                                               new_scalar_dest, first_res, res);
3907                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
3908                   gimple_assign_set_lhs (new_stmt, new_res);
3909                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3910                   VEC_replace (tree, scalar_results, j % group_size, new_res);
3911                 }
3912             }
3913           else
3914             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
3915             VEC_safe_push (tree, heap, scalar_results, new_temp);
3916
3917           extract_scalar_result = false;
3918         }
3919     }
3920
3921   /* 2.4  Extract the final scalar result.  Create:
3922           s_out3 = extract_field <v_out2, bitpos>  */
3923
3924   if (extract_scalar_result)
3925     {
3926       tree rhs;
3927
3928       if (vect_print_dump_info (REPORT_DETAILS))
3929         fprintf (vect_dump, "extract scalar result");
3930
3931       if (BYTES_BIG_ENDIAN)
3932         bitpos = size_binop (MULT_EXPR,
3933                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3934                              TYPE_SIZE (scalar_type));
3935       else
3936         bitpos = bitsize_zero_node;
3937
3938       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3939       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3940       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3941       gimple_assign_set_lhs (epilog_stmt, new_temp);
3942       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3943       VEC_safe_push (tree, heap, scalar_results, new_temp);
3944     }
3945
3946 vect_finalize_reduction:
3947
3948   if (double_reduc)
3949     loop = loop->inner;
3950
3951   /* 2.5 Adjust the final result by the initial value of the reduction
3952          variable. (When such adjustment is not needed, then
3953          'adjustment_def' is zero).  For example, if code is PLUS we create:
3954          new_temp = loop_exit_def + adjustment_def  */
3955
3956   if (adjustment_def)
3957     {
3958       gcc_assert (!slp_reduc);
3959       if (nested_in_vect_loop)
3960         {
3961           new_phi = VEC_index (gimple, new_phis, 0);
3962           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3963           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3964           new_dest = vect_create_destination_var (scalar_dest, vectype);
3965         }
3966       else
3967         {
3968           new_temp = VEC_index (tree, scalar_results, 0);
3969           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3970           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3971           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3972         }
3973
3974       epilog_stmt = gimple_build_assign (new_dest, expr);
3975       new_temp = make_ssa_name (new_dest, epilog_stmt);
3976       gimple_assign_set_lhs (epilog_stmt, new_temp);
3977       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3978       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3979       if (nested_in_vect_loop)
3980         {
3981           set_vinfo_for_stmt (epilog_stmt,
3982                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3983                                                  NULL));
3984           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3985                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3986
3987           if (!double_reduc)
3988             VEC_quick_push (tree, scalar_results, new_temp);
3989           else
3990             VEC_replace (tree, scalar_results, 0, new_temp);
3991         }
3992       else
3993         VEC_replace (tree, scalar_results, 0, new_temp);
3994
3995       VEC_replace (gimple, new_phis, 0, epilog_stmt);
3996     }
3997
3998   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
3999           phis with new adjusted scalar results, i.e., replace use <s_out0>
4000           with use <s_out4>.
4001
4002      Transform:
4003         loop_exit:
4004           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4005           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4006           v_out2 = reduce <v_out1>
4007           s_out3 = extract_field <v_out2, 0>
4008           s_out4 = adjust_result <s_out3>
4009           use <s_out0>
4010           use <s_out0>
4011
4012      into:
4013
4014         loop_exit:
4015           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4016           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4017           v_out2 = reduce <v_out1>
4018           s_out3 = extract_field <v_out2, 0>
4019           s_out4 = adjust_result <s_out3>
4020           use <s_out4>
4021           use <s_out4> */
4022
4023
4024   /* In SLP reduction chain we reduce vector results into one vector if
4025      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4026      the last stmt in the reduction chain, since we are looking for the loop
4027      exit phi node.  */
4028   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4029     {
4030       scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4031                                        SLP_TREE_SCALAR_STMTS (slp_node),
4032                                        group_size - 1));
4033       group_size = 1;
4034     }
4035
4036   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4037      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4038      need to match SCALAR_RESULTS with corresponding statements.  The first
4039      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4040      the first vector stmt, etc.
4041      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4042   if (group_size > VEC_length (gimple, new_phis))
4043     {
4044       ratio = group_size / VEC_length (gimple, new_phis);
4045       gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4046     }
4047   else
4048     ratio = 1;
4049
4050   for (k = 0; k < group_size; k++)
4051     {
4052       if (k % ratio == 0)
4053         {
4054           epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4055           reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4056         }
4057
4058       if (slp_reduc)
4059         {
4060           gimple current_stmt = VEC_index (gimple,
4061                                        SLP_TREE_SCALAR_STMTS (slp_node), k);
4062
4063           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4064           /* SLP statements can't participate in patterns.  */
4065           gcc_assert (!orig_stmt);
4066           scalar_dest = gimple_assign_lhs (current_stmt);
4067         }
4068
4069       phis = VEC_alloc (gimple, heap, 3);
4070       /* Find the loop-closed-use at the loop exit of the original scalar
4071          result.  (The reduction result is expected to have two immediate uses -
4072          one at the latch block, and one at the loop exit).  */
4073       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4074         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4075           VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4076
4077       /* We expect to have found an exit_phi because of loop-closed-ssa
4078          form.  */
4079       gcc_assert (!VEC_empty (gimple, phis));
4080
4081       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4082         {
4083           if (outer_loop)
4084             {
4085               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4086               gimple vect_phi;
4087
4088               /* FORNOW. Currently not supporting the case that an inner-loop
4089                  reduction is not used in the outer-loop (but only outside the
4090                  outer-loop), unless it is double reduction.  */
4091               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4092                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4093                           || double_reduc);
4094
4095               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4096               if (!double_reduc
4097                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4098                       != vect_double_reduction_def)
4099                 continue;
4100
4101               /* Handle double reduction:
4102
4103                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4104                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4105                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4106                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4107
4108                  At that point the regular reduction (stmt2 and stmt3) is
4109                  already vectorized, as well as the exit phi node, stmt4.
4110                  Here we vectorize the phi node of double reduction, stmt1, and
4111                  update all relevant statements.  */
4112
4113               /* Go through all the uses of s2 to find double reduction phi
4114                  node, i.e., stmt1 above.  */
4115               orig_name = PHI_RESULT (exit_phi);
4116               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4117                 {
4118                   stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4119                   stmt_vec_info new_phi_vinfo;
4120                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4121                   basic_block bb = gimple_bb (use_stmt);
4122                   gimple use;
4123
4124                   /* Check that USE_STMT is really double reduction phi
4125                      node.  */
4126                   if (gimple_code (use_stmt) != GIMPLE_PHI
4127                       || gimple_phi_num_args (use_stmt) != 2
4128                       || !use_stmt_vinfo
4129                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4130                           != vect_double_reduction_def
4131                       || bb->loop_father != outer_loop)
4132                     continue;
4133
4134                   /* Create vector phi node for double reduction:
4135                      vs1 = phi <vs0, vs2>
4136                      vs1 was created previously in this function by a call to
4137                        vect_get_vec_def_for_operand and is stored in
4138                        vec_initial_def;
4139                      vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
4140                      vs0 is created here.  */
4141
4142                   /* Create vector phi node.  */
4143                   vect_phi = create_phi_node (vec_initial_def, bb);
4144                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4145                                     loop_vec_info_for_loop (outer_loop), NULL);
4146                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4147
4148                   /* Create vs0 - initial def of the double reduction phi.  */
4149                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4150                                              loop_preheader_edge (outer_loop));
4151                   init_def = get_initial_def_for_reduction (stmt,
4152                                                           preheader_arg, NULL);
4153                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4154                                                     vectype, NULL);
4155
4156                   /* Update phi node arguments with vs0 and vs2.  */
4157                   add_phi_arg (vect_phi, vect_phi_init,
4158                                loop_preheader_edge (outer_loop),
4159                                UNKNOWN_LOCATION);
4160                   add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
4161                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4162                   if (vect_print_dump_info (REPORT_DETAILS))
4163                     {
4164                       fprintf (vect_dump, "created double reduction phi "
4165                                           "node: ");
4166                       print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4167                     }
4168
4169                   vect_phi_res = PHI_RESULT (vect_phi);
4170
4171                   /* Replace the use, i.e., set the correct vs1 in the regular
4172                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4173                      loop is redundant.  */
4174                   use = reduction_phi;
4175                   for (j = 0; j < ncopies; j++)
4176                     {
4177                       edge pr_edge = loop_preheader_edge (loop);
4178                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4179                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4180                     }
4181                 }
4182             }
4183         }
4184
4185       VEC_free (gimple, heap, phis);
4186       if (nested_in_vect_loop)
4187         {
4188           if (double_reduc)
4189             loop = outer_loop;
4190           else
4191             continue;
4192         }
4193
4194       phis = VEC_alloc (gimple, heap, 3);
4195       /* Find the loop-closed-use at the loop exit of the original scalar
4196          result.  (The reduction result is expected to have two immediate uses,
4197          one at the latch block, and one at the loop exit).  For double
4198          reductions we are looking for exit phis of the outer loop.  */
4199       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4200         {
4201           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4202             VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4203           else
4204             {
4205               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4206                 {
4207                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4208
4209                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4210                     {
4211                       if (!flow_bb_inside_loop_p (loop,
4212                                              gimple_bb (USE_STMT (phi_use_p))))
4213                         VEC_safe_push (gimple, heap, phis,
4214                                        USE_STMT (phi_use_p));
4215                     }
4216                 }
4217             }
4218         }
4219
4220       FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4221         {
4222           /* Replace the uses:  */
4223           orig_name = PHI_RESULT (exit_phi);
4224           scalar_result = VEC_index (tree, scalar_results, k);
4225           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4226             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4227               SET_USE (use_p, scalar_result);
4228         }
4229
4230       VEC_free (gimple, heap, phis);
4231     }
4232
4233   VEC_free (tree, heap, scalar_results);
4234   VEC_free (gimple, heap, new_phis);
4235 }
4236
4237
4238 /* Function vectorizable_reduction.
4239
4240    Check if STMT performs a reduction operation that can be vectorized.
4241    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4242    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4243    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4244
4245    This function also handles reduction idioms (patterns) that have been
4246    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4247    of this form:
4248      X = pattern_expr (arg0, arg1, ..., X)
4249    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4250    sequence that had been detected and replaced by the pattern-stmt (STMT).
4251
4252    In some cases of reduction patterns, the type of the reduction variable X is
4253    different than the type of the other arguments of STMT.
4254    In such cases, the vectype that is used when transforming STMT into a vector
4255    stmt is different than the vectype that is used to determine the
4256    vectorization factor, because it consists of a different number of elements
4257    than the actual number of elements that are being operated upon in parallel.
4258
4259    For example, consider an accumulation of shorts into an int accumulator.
4260    On some targets it's possible to vectorize this pattern operating on 8
4261    shorts at a time (hence, the vectype for purposes of determining the
4262    vectorization factor should be V8HI); on the other hand, the vectype that
4263    is used to create the vector form is actually V4SI (the type of the result).
4264
4265    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4266    indicates what is the actual level of parallelism (V8HI in the example), so
4267    that the right vectorization factor would be derived.  This vectype
4268    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4269    be used to create the vectorized stmt.  The right vectype for the vectorized
4270    stmt is obtained from the type of the result X:
4271         get_vectype_for_scalar_type (TREE_TYPE (X))
4272
4273    This means that, contrary to "regular" reductions (or "regular" stmts in
4274    general), the following equation:
4275       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4276    does *NOT* necessarily hold for reduction patterns.  */
4277
4278 bool
4279 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4280                         gimple *vec_stmt, slp_tree slp_node)
4281 {
4282   tree vec_dest;
4283   tree scalar_dest;
4284   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4285   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4286   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4287   tree vectype_in = NULL_TREE;
4288   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4289   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4290   enum tree_code code, orig_code, epilog_reduc_code;
4291   enum machine_mode vec_mode;
4292   int op_type;
4293   optab optab, reduc_optab;
4294   tree new_temp = NULL_TREE;
4295   tree def;
4296   gimple def_stmt;
4297   enum vect_def_type dt;
4298   gimple new_phi = NULL;
4299   tree scalar_type;
4300   bool is_simple_use;
4301   gimple orig_stmt;
4302   stmt_vec_info orig_stmt_info;
4303   tree expr = NULL_TREE;
4304   int i;
4305   int ncopies;
4306   int epilog_copies;
4307   stmt_vec_info prev_stmt_info, prev_phi_info;
4308   bool single_defuse_cycle = false;
4309   tree reduc_def = NULL_TREE;
4310   gimple new_stmt = NULL;
4311   int j;
4312   tree ops[3];
4313   bool nested_cycle = false, found_nested_cycle_def = false;
4314   gimple reduc_def_stmt = NULL;
4315   /* The default is that the reduction variable is the last in statement.  */
4316   int reduc_index = 2;
4317   bool double_reduc = false, dummy;
4318   basic_block def_bb;
4319   struct loop * def_stmt_loop, *outer_loop = NULL;
4320   tree def_arg;
4321   gimple def_arg_stmt;
4322   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4323   VEC (gimple, heap) *phis = NULL;
4324   int vec_num;
4325   tree def0, def1, tem, op0, op1 = NULL_TREE;
4326
4327   /* In case of reduction chain we switch to the first stmt in the chain, but
4328      we don't update STMT_INFO, since only the last stmt is marked as reduction
4329      and has reduction properties.  */
4330   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4331     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4332
4333   if (nested_in_vect_loop_p (loop, stmt))
4334     {
4335       outer_loop = loop;
4336       loop = loop->inner;
4337       nested_cycle = true;
4338     }
4339
4340   /* 1. Is vectorizable reduction?  */
4341   /* Not supportable if the reduction variable is used in the loop, unless
4342      it's a reduction chain.  */
4343   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4344       && !GROUP_FIRST_ELEMENT (stmt_info))
4345     return false;
4346
4347   /* Reductions that are not used even in an enclosing outer-loop,
4348      are expected to be "live" (used out of the loop).  */
4349   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4350       && !STMT_VINFO_LIVE_P (stmt_info))
4351     return false;
4352
4353   /* Make sure it was already recognized as a reduction computation.  */
4354   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4355       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4356     return false;
4357
4358   /* 2. Has this been recognized as a reduction pattern?
4359
4360      Check if STMT represents a pattern that has been recognized
4361      in earlier analysis stages.  For stmts that represent a pattern,
4362      the STMT_VINFO_RELATED_STMT field records the last stmt in
4363      the original sequence that constitutes the pattern.  */
4364
4365   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4366   if (orig_stmt)
4367     {
4368       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4369       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4370       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4371       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4372     }
4373
4374   /* 3. Check the operands of the operation.  The first operands are defined
4375         inside the loop body. The last operand is the reduction variable,
4376         which is defined by the loop-header-phi.  */
4377
4378   gcc_assert (is_gimple_assign (stmt));
4379
4380   /* Flatten RHS.  */
4381   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4382     {
4383     case GIMPLE_SINGLE_RHS:
4384       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4385       if (op_type == ternary_op)
4386         {
4387           tree rhs = gimple_assign_rhs1 (stmt);
4388           ops[0] = TREE_OPERAND (rhs, 0);
4389           ops[1] = TREE_OPERAND (rhs, 1);
4390           ops[2] = TREE_OPERAND (rhs, 2);
4391           code = TREE_CODE (rhs);
4392         }
4393       else
4394         return false;
4395       break;
4396
4397     case GIMPLE_BINARY_RHS:
4398       code = gimple_assign_rhs_code (stmt);
4399       op_type = TREE_CODE_LENGTH (code);
4400       gcc_assert (op_type == binary_op);
4401       ops[0] = gimple_assign_rhs1 (stmt);
4402       ops[1] = gimple_assign_rhs2 (stmt);
4403       break;
4404
4405     case GIMPLE_TERNARY_RHS:
4406       code = gimple_assign_rhs_code (stmt);
4407       op_type = TREE_CODE_LENGTH (code);
4408       gcc_assert (op_type == ternary_op);
4409       ops[0] = gimple_assign_rhs1 (stmt);
4410       ops[1] = gimple_assign_rhs2 (stmt);
4411       ops[2] = gimple_assign_rhs3 (stmt);
4412       break;
4413
4414     case GIMPLE_UNARY_RHS:
4415       return false;
4416
4417     default:
4418       gcc_unreachable ();
4419     }
4420
4421   if (code == COND_EXPR && slp_node)
4422     return false;
4423
4424   scalar_dest = gimple_assign_lhs (stmt);
4425   scalar_type = TREE_TYPE (scalar_dest);
4426   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4427       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4428     return false;
4429
4430   /* Do not try to vectorize bit-precision reductions.  */
4431   if ((TYPE_PRECISION (scalar_type)
4432        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4433     return false;
4434
4435   /* All uses but the last are expected to be defined in the loop.
4436      The last use is the reduction variable.  In case of nested cycle this
4437      assumption is not true: we use reduc_index to record the index of the
4438      reduction variable.  */
4439   for (i = 0; i < op_type-1; i++)
4440     {
4441       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4442       if (i == 0 && code == COND_EXPR)
4443         continue;
4444
4445       is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4446                                             &def_stmt, &def, &dt, &tem);
4447       if (!vectype_in)
4448         vectype_in = tem;
4449       gcc_assert (is_simple_use);
4450
4451       if (dt != vect_internal_def
4452           && dt != vect_external_def
4453           && dt != vect_constant_def
4454           && dt != vect_induction_def
4455           && !(dt == vect_nested_cycle && nested_cycle))
4456         return false;
4457
4458       if (dt == vect_nested_cycle)
4459         {
4460           found_nested_cycle_def = true;
4461           reduc_def_stmt = def_stmt;
4462           reduc_index = i;
4463         }
4464     }
4465
4466   is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4467                                         &def, &dt, &tem);
4468   if (!vectype_in)
4469     vectype_in = tem;
4470   gcc_assert (is_simple_use);
4471   gcc_assert (dt == vect_reduction_def
4472               || dt == vect_nested_cycle
4473               || ((dt == vect_internal_def || dt == vect_external_def
4474                    || dt == vect_constant_def || dt == vect_induction_def)
4475                    && nested_cycle && found_nested_cycle_def));
4476   if (!found_nested_cycle_def)
4477     reduc_def_stmt = def_stmt;
4478
4479   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4480   if (orig_stmt)
4481     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4482                                                        reduc_def_stmt,
4483                                                        !nested_cycle,
4484                                                        &dummy));
4485   else
4486     {
4487       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4488                                              !nested_cycle, &dummy);
4489       /* We changed STMT to be the first stmt in reduction chain, hence we
4490          check that in this case the first element in the chain is STMT.  */
4491       gcc_assert (stmt == tmp
4492                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4493     }
4494
4495   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4496     return false;
4497
4498   if (slp_node || PURE_SLP_STMT (stmt_info))
4499     ncopies = 1;
4500   else
4501     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4502                / TYPE_VECTOR_SUBPARTS (vectype_in));
4503
4504   gcc_assert (ncopies >= 1);
4505
4506   vec_mode = TYPE_MODE (vectype_in);
4507
4508   if (code == COND_EXPR)
4509     {
4510       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4511         {
4512           if (vect_print_dump_info (REPORT_DETAILS))
4513             fprintf (vect_dump, "unsupported condition in reduction");
4514
4515             return false;
4516         }
4517     }
4518   else
4519     {
4520       /* 4. Supportable by target?  */
4521
4522       /* 4.1. check support for the operation in the loop  */
4523       optab = optab_for_tree_code (code, vectype_in, optab_default);
4524       if (!optab)
4525         {
4526           if (vect_print_dump_info (REPORT_DETAILS))
4527             fprintf (vect_dump, "no optab.");
4528
4529           return false;
4530         }
4531
4532       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4533         {
4534           if (vect_print_dump_info (REPORT_DETAILS))
4535             fprintf (vect_dump, "op not supported by target.");
4536
4537           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4538               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4539                   < vect_min_worthwhile_factor (code))
4540             return false;
4541
4542           if (vect_print_dump_info (REPORT_DETAILS))
4543             fprintf (vect_dump, "proceeding using word mode.");
4544         }
4545
4546       /* Worthwhile without SIMD support?  */
4547       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4548           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4549              < vect_min_worthwhile_factor (code))
4550         {
4551           if (vect_print_dump_info (REPORT_DETAILS))
4552             fprintf (vect_dump, "not worthwhile without SIMD support.");
4553
4554           return false;
4555         }
4556     }
4557
4558   /* 4.2. Check support for the epilog operation.
4559
4560           If STMT represents a reduction pattern, then the type of the
4561           reduction variable may be different than the type of the rest
4562           of the arguments.  For example, consider the case of accumulation
4563           of shorts into an int accumulator; The original code:
4564                         S1: int_a = (int) short_a;
4565           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4566
4567           was replaced with:
4568                         STMT: int_acc = widen_sum <short_a, int_acc>
4569
4570           This means that:
4571           1. The tree-code that is used to create the vector operation in the
4572              epilog code (that reduces the partial results) is not the
4573              tree-code of STMT, but is rather the tree-code of the original
4574              stmt from the pattern that STMT is replacing.  I.e, in the example
4575              above we want to use 'widen_sum' in the loop, but 'plus' in the
4576              epilog.
4577           2. The type (mode) we use to check available target support
4578              for the vector operation to be created in the *epilog*, is
4579              determined by the type of the reduction variable (in the example
4580              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4581              However the type (mode) we use to check available target support
4582              for the vector operation to be created *inside the loop*, is
4583              determined by the type of the other arguments to STMT (in the
4584              example we'd check this: optab_handler (widen_sum_optab,
4585              vect_short_mode)).
4586
4587           This is contrary to "regular" reductions, in which the types of all
4588           the arguments are the same as the type of the reduction variable.
4589           For "regular" reductions we can therefore use the same vector type
4590           (and also the same tree-code) when generating the epilog code and
4591           when generating the code inside the loop.  */
4592
4593   if (orig_stmt)
4594     {
4595       /* This is a reduction pattern: get the vectype from the type of the
4596          reduction variable, and get the tree-code from orig_stmt.  */
4597       orig_code = gimple_assign_rhs_code (orig_stmt);
4598       gcc_assert (vectype_out);
4599       vec_mode = TYPE_MODE (vectype_out);
4600     }
4601   else
4602     {
4603       /* Regular reduction: use the same vectype and tree-code as used for
4604          the vector code inside the loop can be used for the epilog code. */
4605       orig_code = code;
4606     }
4607
4608   if (nested_cycle)
4609     {
4610       def_bb = gimple_bb (reduc_def_stmt);
4611       def_stmt_loop = def_bb->loop_father;
4612       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4613                                        loop_preheader_edge (def_stmt_loop));
4614       if (TREE_CODE (def_arg) == SSA_NAME
4615           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4616           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4617           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4618           && vinfo_for_stmt (def_arg_stmt)
4619           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4620               == vect_double_reduction_def)
4621         double_reduc = true;
4622     }
4623
4624   epilog_reduc_code = ERROR_MARK;
4625   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4626     {
4627       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4628                                          optab_default);
4629       if (!reduc_optab)
4630         {
4631           if (vect_print_dump_info (REPORT_DETAILS))
4632             fprintf (vect_dump, "no optab for reduction.");
4633
4634           epilog_reduc_code = ERROR_MARK;
4635         }
4636
4637       if (reduc_optab
4638           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4639         {
4640           if (vect_print_dump_info (REPORT_DETAILS))
4641             fprintf (vect_dump, "reduc op not supported by target.");
4642
4643           epilog_reduc_code = ERROR_MARK;
4644         }
4645     }
4646   else
4647     {
4648       if (!nested_cycle || double_reduc)
4649         {
4650           if (vect_print_dump_info (REPORT_DETAILS))
4651             fprintf (vect_dump, "no reduc code for scalar code.");
4652
4653           return false;
4654         }
4655     }
4656
4657   if (double_reduc && ncopies > 1)
4658     {
4659       if (vect_print_dump_info (REPORT_DETAILS))
4660         fprintf (vect_dump, "multiple types in double reduction");
4661
4662       return false;
4663     }
4664
4665   /* In case of widenning multiplication by a constant, we update the type
4666      of the constant to be the type of the other operand.  We check that the
4667      constant fits the type in the pattern recognition pass.  */
4668   if (code == DOT_PROD_EXPR
4669       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4670     {
4671       if (TREE_CODE (ops[0]) == INTEGER_CST)
4672         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4673       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4674         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4675       else
4676         {
4677           if (vect_print_dump_info (REPORT_DETAILS))
4678             fprintf (vect_dump, "invalid types in dot-prod");
4679
4680           return false;
4681         }
4682     }
4683
4684   if (!vec_stmt) /* transformation not required.  */
4685     {
4686       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4687         return false;
4688       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4689       return true;
4690     }
4691
4692   /** Transform.  **/
4693
4694   if (vect_print_dump_info (REPORT_DETAILS))
4695     fprintf (vect_dump, "transform reduction.");
4696
4697   /* FORNOW: Multiple types are not supported for condition.  */
4698   if (code == COND_EXPR)
4699     gcc_assert (ncopies == 1);
4700
4701   /* Create the destination vector  */
4702   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4703
4704   /* In case the vectorization factor (VF) is bigger than the number
4705      of elements that we can fit in a vectype (nunits), we have to generate
4706      more than one vector stmt - i.e - we need to "unroll" the
4707      vector stmt by a factor VF/nunits.  For more details see documentation
4708      in vectorizable_operation.  */
4709
4710   /* If the reduction is used in an outer loop we need to generate
4711      VF intermediate results, like so (e.g. for ncopies=2):
4712         r0 = phi (init, r0)
4713         r1 = phi (init, r1)
4714         r0 = x0 + r0;
4715         r1 = x1 + r1;
4716     (i.e. we generate VF results in 2 registers).
4717     In this case we have a separate def-use cycle for each copy, and therefore
4718     for each copy we get the vector def for the reduction variable from the
4719     respective phi node created for this copy.
4720
4721     Otherwise (the reduction is unused in the loop nest), we can combine
4722     together intermediate results, like so (e.g. for ncopies=2):
4723         r = phi (init, r)
4724         r = x0 + r;
4725         r = x1 + r;
4726    (i.e. we generate VF/2 results in a single register).
4727    In this case for each copy we get the vector def for the reduction variable
4728    from the vectorized reduction operation generated in the previous iteration.
4729   */
4730
4731   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4732     {
4733       single_defuse_cycle = true;
4734       epilog_copies = 1;
4735     }
4736   else
4737     epilog_copies = ncopies;
4738
4739   prev_stmt_info = NULL;
4740   prev_phi_info = NULL;
4741   if (slp_node)
4742     {
4743       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4744       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4745                   == TYPE_VECTOR_SUBPARTS (vectype_in));
4746     }
4747   else
4748     {
4749       vec_num = 1;
4750       vec_oprnds0 = VEC_alloc (tree, heap, 1);
4751       if (op_type == ternary_op)
4752         vec_oprnds1 = VEC_alloc (tree, heap, 1);
4753     }
4754
4755   phis = VEC_alloc (gimple, heap, vec_num);
4756   vect_defs = VEC_alloc (tree, heap, vec_num);
4757   if (!slp_node)
4758     VEC_quick_push (tree, vect_defs, NULL_TREE);
4759
4760   for (j = 0; j < ncopies; j++)
4761     {
4762       if (j == 0 || !single_defuse_cycle)
4763         {
4764           for (i = 0; i < vec_num; i++)
4765             {
4766               /* Create the reduction-phi that defines the reduction
4767                  operand.  */
4768               new_phi = create_phi_node (vec_dest, loop->header);
4769               set_vinfo_for_stmt (new_phi,
4770                                   new_stmt_vec_info (new_phi, loop_vinfo,
4771                                                      NULL));
4772                if (j == 0 || slp_node)
4773                  VEC_quick_push (gimple, phis, new_phi);
4774             }
4775         }
4776
4777       if (code == COND_EXPR)
4778         {
4779           gcc_assert (!slp_node);
4780           vectorizable_condition (stmt, gsi, vec_stmt,
4781                                   PHI_RESULT (VEC_index (gimple, phis, 0)),
4782                                   reduc_index, NULL);
4783           /* Multiple types are not supported for condition.  */
4784           break;
4785         }
4786
4787       /* Handle uses.  */
4788       if (j == 0)
4789         {
4790           op0 = ops[!reduc_index];
4791           if (op_type == ternary_op)
4792             {
4793               if (reduc_index == 0)
4794                 op1 = ops[2];
4795               else
4796                 op1 = ops[1];
4797             }
4798
4799           if (slp_node)
4800             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4801                                slp_node, -1);
4802           else
4803             {
4804               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4805                                                             stmt, NULL);
4806               VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4807               if (op_type == ternary_op)
4808                {
4809                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4810                                                                NULL);
4811                  VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4812                }
4813             }
4814         }
4815       else
4816         {
4817           if (!slp_node)
4818             {
4819               enum vect_def_type dt;
4820               gimple dummy_stmt;
4821               tree dummy;
4822
4823               vect_is_simple_use (ops[!reduc_index], loop_vinfo, NULL,
4824                                   &dummy_stmt, &dummy, &dt);
4825               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
4826                                                               loop_vec_def0);
4827               VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4828               if (op_type == ternary_op)
4829                 {
4830                   vect_is_simple_use (op1, loop_vinfo, NULL, &dummy_stmt,
4831                                       &dummy, &dt);
4832                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4833                                                                 loop_vec_def1);
4834                   VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4835                 }
4836             }
4837
4838           if (single_defuse_cycle)
4839             reduc_def = gimple_assign_lhs (new_stmt);
4840
4841           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4842         }
4843
4844       FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4845         {
4846           if (slp_node)
4847             reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4848           else
4849             {
4850               if (!single_defuse_cycle || j == 0)
4851                 reduc_def = PHI_RESULT (new_phi);
4852             }
4853
4854           def1 = ((op_type == ternary_op)
4855                   ? VEC_index (tree, vec_oprnds1, i) : NULL);
4856           if (op_type == binary_op)
4857             {
4858               if (reduc_index == 0)
4859                 expr = build2 (code, vectype_out, reduc_def, def0);
4860               else
4861                 expr = build2 (code, vectype_out, def0, reduc_def);
4862             }
4863           else
4864             {
4865               if (reduc_index == 0)
4866                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4867               else
4868                 {
4869                   if (reduc_index == 1)
4870                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
4871                   else
4872                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
4873                 }
4874             }
4875
4876           new_stmt = gimple_build_assign (vec_dest, expr);
4877           new_temp = make_ssa_name (vec_dest, new_stmt);
4878           gimple_assign_set_lhs (new_stmt, new_temp);
4879           vect_finish_stmt_generation (stmt, new_stmt, gsi);
4880
4881           if (slp_node)
4882             {
4883               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4884               VEC_quick_push (tree, vect_defs, new_temp);
4885             }
4886           else
4887             VEC_replace (tree, vect_defs, 0, new_temp);
4888         }
4889
4890       if (slp_node)
4891         continue;
4892
4893       if (j == 0)
4894         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4895       else
4896         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4897
4898       prev_stmt_info = vinfo_for_stmt (new_stmt);
4899       prev_phi_info = vinfo_for_stmt (new_phi);
4900     }
4901
4902   /* Finalize the reduction-phi (set its arguments) and create the
4903      epilog reduction code.  */
4904   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4905     {
4906       new_temp = gimple_assign_lhs (*vec_stmt);
4907       VEC_replace (tree, vect_defs, 0, new_temp);
4908     }
4909
4910   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4911                                     epilog_reduc_code, phis, reduc_index,
4912                                     double_reduc, slp_node);
4913
4914   VEC_free (gimple, heap, phis);
4915   VEC_free (tree, heap, vec_oprnds0);
4916   if (vec_oprnds1)
4917     VEC_free (tree, heap, vec_oprnds1);
4918
4919   return true;
4920 }
4921
4922 /* Function vect_min_worthwhile_factor.
4923
4924    For a loop where we could vectorize the operation indicated by CODE,
4925    return the minimum vectorization factor that makes it worthwhile
4926    to use generic vectors.  */
4927 int
4928 vect_min_worthwhile_factor (enum tree_code code)
4929 {
4930   switch (code)
4931     {
4932     case PLUS_EXPR:
4933     case MINUS_EXPR:
4934     case NEGATE_EXPR:
4935       return 4;
4936
4937     case BIT_AND_EXPR:
4938     case BIT_IOR_EXPR:
4939     case BIT_XOR_EXPR:
4940     case BIT_NOT_EXPR:
4941       return 2;
4942
4943     default:
4944       return INT_MAX;
4945     }
4946 }
4947
4948
4949 /* Function vectorizable_induction
4950
4951    Check if PHI performs an induction computation that can be vectorized.
4952    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4953    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4954    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4955
4956 bool
4957 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4958                         gimple *vec_stmt)
4959 {
4960   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4961   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4962   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4963   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4964   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4965   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4966   tree vec_def;
4967
4968   gcc_assert (ncopies >= 1);
4969   /* FORNOW. This restriction should be relaxed.  */
4970   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4971     {
4972       if (vect_print_dump_info (REPORT_DETAILS))
4973         fprintf (vect_dump, "multiple types in nested loop.");
4974       return false;
4975     }
4976
4977   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4978     return false;
4979
4980   /* FORNOW: SLP not supported.  */
4981   if (STMT_SLP_TYPE (stmt_info))
4982     return false;
4983
4984   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4985
4986   if (gimple_code (phi) != GIMPLE_PHI)
4987     return false;
4988
4989   if (!vec_stmt) /* transformation not required.  */
4990     {
4991       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4992       if (vect_print_dump_info (REPORT_DETAILS))
4993         fprintf (vect_dump, "=== vectorizable_induction ===");
4994       vect_model_induction_cost (stmt_info, ncopies);
4995       return true;
4996     }
4997
4998   /** Transform.  **/
4999
5000   if (vect_print_dump_info (REPORT_DETAILS))
5001     fprintf (vect_dump, "transform induction phi.");
5002
5003   vec_def = get_initial_def_for_induction (phi);
5004   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5005   return true;
5006 }
5007
5008 /* Function vectorizable_live_operation.
5009
5010    STMT computes a value that is used outside the loop.  Check if
5011    it can be supported.  */
5012
5013 bool
5014 vectorizable_live_operation (gimple stmt,
5015                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5016                              gimple *vec_stmt ATTRIBUTE_UNUSED)
5017 {
5018   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5019   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5020   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5021   int i;
5022   int op_type;
5023   tree op;
5024   tree def;
5025   gimple def_stmt;
5026   enum vect_def_type dt;
5027   enum tree_code code;
5028   enum gimple_rhs_class rhs_class;
5029
5030   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5031
5032   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5033     return false;
5034
5035   if (!is_gimple_assign (stmt))
5036     return false;
5037
5038   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5039     return false;
5040
5041   /* FORNOW. CHECKME. */
5042   if (nested_in_vect_loop_p (loop, stmt))
5043     return false;
5044
5045   code = gimple_assign_rhs_code (stmt);
5046   op_type = TREE_CODE_LENGTH (code);
5047   rhs_class = get_gimple_rhs_class (code);
5048   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5049   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5050
5051   /* FORNOW: support only if all uses are invariant.  This means
5052      that the scalar operations can remain in place, unvectorized.
5053      The original last scalar value that they compute will be used.  */
5054
5055   for (i = 0; i < op_type; i++)
5056     {
5057       if (rhs_class == GIMPLE_SINGLE_RHS)
5058         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5059       else
5060         op = gimple_op (stmt, i + 1);
5061       if (op
5062           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
5063         {
5064           if (vect_print_dump_info (REPORT_DETAILS))
5065             fprintf (vect_dump, "use not simple.");
5066           return false;
5067         }
5068
5069       if (dt != vect_external_def && dt != vect_constant_def)
5070         return false;
5071     }
5072
5073   /* No transformation is required for the cases we currently support.  */
5074   return true;
5075 }
5076
5077 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5078
5079 static void
5080 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5081 {
5082   ssa_op_iter op_iter;
5083   imm_use_iterator imm_iter;
5084   def_operand_p def_p;
5085   gimple ustmt;
5086
5087   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5088     {
5089       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5090         {
5091           basic_block bb;
5092
5093           if (!is_gimple_debug (ustmt))
5094             continue;
5095
5096           bb = gimple_bb (ustmt);
5097
5098           if (!flow_bb_inside_loop_p (loop, bb))
5099             {
5100               if (gimple_debug_bind_p (ustmt))
5101                 {
5102                   if (vect_print_dump_info (REPORT_DETAILS))
5103                     fprintf (vect_dump, "killing debug use");
5104
5105                   gimple_debug_bind_reset_value (ustmt);
5106                   update_stmt (ustmt);
5107                 }
5108               else
5109                 gcc_unreachable ();
5110             }
5111         }
5112     }
5113 }
5114
5115 /* Function vect_transform_loop.
5116
5117    The analysis phase has determined that the loop is vectorizable.
5118    Vectorize the loop - created vectorized stmts to replace the scalar
5119    stmts in the loop, and update the loop exit condition.  */
5120
5121 void
5122 vect_transform_loop (loop_vec_info loop_vinfo)
5123 {
5124   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5125   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5126   int nbbs = loop->num_nodes;
5127   gimple_stmt_iterator si;
5128   int i;
5129   tree ratio = NULL;
5130   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5131   bool strided_store;
5132   bool slp_scheduled = false;
5133   unsigned int nunits;
5134   tree cond_expr = NULL_TREE;
5135   gimple_seq cond_expr_stmt_list = NULL;
5136   bool do_peeling_for_loop_bound;
5137   gimple stmt, pattern_stmt, pattern_def_stmt;
5138   bool transform_pattern_stmt = false, pattern_def = false;
5139
5140   if (vect_print_dump_info (REPORT_DETAILS))
5141     fprintf (vect_dump, "=== vec_transform_loop ===");
5142
5143   /* Peel the loop if there are data refs with unknown alignment.
5144      Only one data ref with unknown store is allowed.  */
5145
5146   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5147     vect_do_peeling_for_alignment (loop_vinfo);
5148
5149   do_peeling_for_loop_bound
5150     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5151        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5152            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5153        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
5154
5155   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5156       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5157     vect_loop_versioning (loop_vinfo,
5158                           !do_peeling_for_loop_bound,
5159                           &cond_expr, &cond_expr_stmt_list);
5160
5161   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5162      compile time constant), or it is a constant that doesn't divide by the
5163      vectorization factor, then an epilog loop needs to be created.
5164      We therefore duplicate the loop: the original loop will be vectorized,
5165      and will compute the first (n/VF) iterations.  The second copy of the loop
5166      will remain scalar and will compute the remaining (n%VF) iterations.
5167      (VF is the vectorization factor).  */
5168
5169   if (do_peeling_for_loop_bound)
5170     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5171                                     cond_expr, cond_expr_stmt_list);
5172   else
5173     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5174                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5175
5176   /* 1) Make sure the loop header has exactly two entries
5177      2) Make sure we have a preheader basic block.  */
5178
5179   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5180
5181   split_edge (loop_preheader_edge (loop));
5182
5183   /* FORNOW: the vectorizer supports only loops which body consist
5184      of one basic block (header + empty latch). When the vectorizer will
5185      support more involved loop forms, the order by which the BBs are
5186      traversed need to be reconsidered.  */
5187
5188   for (i = 0; i < nbbs; i++)
5189     {
5190       basic_block bb = bbs[i];
5191       stmt_vec_info stmt_info;
5192       gimple phi;
5193
5194       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5195         {
5196           phi = gsi_stmt (si);
5197           if (vect_print_dump_info (REPORT_DETAILS))
5198             {
5199               fprintf (vect_dump, "------>vectorizing phi: ");
5200               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5201             }
5202           stmt_info = vinfo_for_stmt (phi);
5203           if (!stmt_info)
5204             continue;
5205
5206           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5207             vect_loop_kill_debug_uses (loop, phi);
5208
5209           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5210               && !STMT_VINFO_LIVE_P (stmt_info))
5211             continue;
5212
5213           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5214                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5215               && vect_print_dump_info (REPORT_DETAILS))
5216             fprintf (vect_dump, "multiple-types.");
5217
5218           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5219             {
5220               if (vect_print_dump_info (REPORT_DETAILS))
5221                 fprintf (vect_dump, "transform phi.");
5222               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5223             }
5224         }
5225
5226       pattern_stmt = NULL;
5227       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5228         {
5229           bool is_store;
5230
5231           if (transform_pattern_stmt)
5232             {
5233               stmt = pattern_stmt;
5234               transform_pattern_stmt = false;
5235             }
5236           else
5237             stmt = gsi_stmt (si);
5238
5239           if (vect_print_dump_info (REPORT_DETAILS))
5240             {
5241               fprintf (vect_dump, "------>vectorizing statement: ");
5242               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5243             }
5244
5245           stmt_info = vinfo_for_stmt (stmt);
5246
5247           /* vector stmts created in the outer-loop during vectorization of
5248              stmts in an inner-loop may not have a stmt_info, and do not
5249              need to be vectorized.  */
5250           if (!stmt_info)
5251             {
5252               gsi_next (&si);
5253               continue;
5254             }
5255
5256           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5257             vect_loop_kill_debug_uses (loop, stmt);
5258
5259           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5260               && !STMT_VINFO_LIVE_P (stmt_info))
5261             {
5262               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5263                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5264                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5265                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5266                 {
5267                   stmt = pattern_stmt;
5268                   stmt_info = vinfo_for_stmt (stmt);
5269                 }
5270               else
5271                 {
5272                   gsi_next (&si);
5273                   continue;
5274                 }
5275             }
5276           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5277                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5278                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5279                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5280             transform_pattern_stmt = true;
5281
5282           /* If pattern statement has a def stmt, vectorize it too.  */
5283           if (is_pattern_stmt_p (stmt_info)
5284               && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
5285               && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
5286                   || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
5287             {
5288               if (pattern_def)
5289                 pattern_def = false;
5290               else
5291                 {
5292                   if (vect_print_dump_info (REPORT_DETAILS))
5293                     {
5294                       fprintf (vect_dump, "==> vectorizing pattern def"
5295                                           " stmt: ");
5296                       print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
5297                                          TDF_SLIM);
5298                     }
5299
5300                   pattern_def = true;
5301                   stmt = pattern_def_stmt;
5302                   stmt_info = vinfo_for_stmt (stmt);
5303                 }
5304             }
5305
5306           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5307           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5308                                                STMT_VINFO_VECTYPE (stmt_info));
5309           if (!STMT_SLP_TYPE (stmt_info)
5310               && nunits != (unsigned int) vectorization_factor
5311               && vect_print_dump_info (REPORT_DETAILS))
5312             /* For SLP VF is set according to unrolling factor, and not to
5313                vector size, hence for SLP this print is not valid.  */
5314             fprintf (vect_dump, "multiple-types.");
5315
5316           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5317              reached.  */
5318           if (STMT_SLP_TYPE (stmt_info))
5319             {
5320               if (!slp_scheduled)
5321                 {
5322                   slp_scheduled = true;
5323
5324                   if (vect_print_dump_info (REPORT_DETAILS))
5325                     fprintf (vect_dump, "=== scheduling SLP instances ===");
5326
5327                   vect_schedule_slp (loop_vinfo, NULL);
5328                 }
5329
5330               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5331               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5332                 {
5333                   if (!transform_pattern_stmt && !pattern_def)
5334                     gsi_next (&si);
5335                   continue;
5336                 }
5337             }
5338
5339           /* -------- vectorize statement ------------ */
5340           if (vect_print_dump_info (REPORT_DETAILS))
5341             fprintf (vect_dump, "transform statement.");
5342
5343           strided_store = false;
5344           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5345           if (is_store)
5346             {
5347               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5348                 {
5349                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5350                      interleaving chain was completed - free all the stores in
5351                      the chain.  */
5352                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5353                   gsi_remove (&si, true);
5354                   continue;
5355                 }
5356               else
5357                 {
5358                   /* Free the attached stmt_vec_info and remove the stmt.  */
5359                   free_stmt_vec_info (stmt);
5360                   gsi_remove (&si, true);
5361                   continue;
5362                 }
5363             }
5364
5365           if (!transform_pattern_stmt && !pattern_def)
5366             gsi_next (&si);
5367         }                       /* stmts in BB */
5368     }                           /* BBs in loop */
5369
5370   slpeel_make_loop_iterate_ntimes (loop, ratio);
5371
5372   /* The memory tags and pointers in vectorized statements need to
5373      have their SSA forms updated.  FIXME, why can't this be delayed
5374      until all the loops have been transformed?  */
5375   update_ssa (TODO_update_ssa);
5376
5377   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5378     fprintf (vect_dump, "LOOP VECTORIZED.");
5379   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5380     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
5381 }