gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "tree-pretty-print.h"
  31 #include "gimple-pretty-print.h"
  32 #include "tree-flow.h"
  33 #include "tree-dump.h"
  34 #include "cfgloop.h"
  35 #include "cfglayout.h"
  36 #include "expr.h"
  37 #include "recog.h"
  38 #include "optabs.h"
  39 #include "params.h"
  40 #include "toplev.h"
  41 #include "tree-chrec.h"
  42 #include "tree-scalar-evolution.h"
  43 #include "tree-vectorizer.h"
  44 #include "target.h"
  45
  46 /* Loop Vectorization Pass.
  47
  48    This pass tries to vectorize loops.
  49
  50    For example, the vectorizer transforms the following simple loop:
  51
  52         short a[N]; short b[N]; short c[N]; int i;
  53
  54         for (i=0; i<N; i++){
  55           a[i] = b[i] + c[i];
  56         }
  57
  58    as if it was manually vectorized by rewriting the source code into:
  59
  60         typedef int __attribute__((mode(V8HI))) v8hi;
  61         short a[N];  short b[N]; short c[N];   int i;
  62         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  63         v8hi va, vb, vc;
  64
  65         for (i=0; i<N/8; i++){
  66           vb = pb[i];
  67           vc = pc[i];
  68           va = vb + vc;
  69           pa[i] = va;
  70         }
  71
  72         The main entry to this pass is vectorize_loops(), in which
  73    the vectorizer applies a set of analyses on a given set of loops,
  74    followed by the actual vectorization transformation for the loops that
  75    had successfully passed the analysis phase.
  76         Throughout this pass we make a distinction between two types of
  77    data: scalars (which are represented by SSA_NAMES), and memory references
  78    ("data-refs"). These two types of data require different handling both
  79    during analysis and transformation. The types of data-refs that the
  80    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  81    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  82    accesses are required to have a simple (consecutive) access pattern.
  83
  84    Analysis phase:
  85    ===============
  86         The driver for the analysis phase is vect_analyze_loop().
  87    It applies a set of analyses, some of which rely on the scalar evolution
  88    analyzer (scev) developed by Sebastian Pop.
  89
  90         During the analysis phase the vectorizer records some information
  91    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  92    loop, as well as general information about the loop as a whole, which is
  93    recorded in a "loop_vec_info" struct attached to each loop.
  94
  95    Transformation phase:
  96    =====================
  97         The loop transformation phase scans all the stmts in the loop, and
  98    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  99    the loop that needs to be vectorized. It inserts the vector code sequence
 100    just before the scalar stmt S, and records a pointer to the vector code
 101    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 102    attached to S). This pointer will be used for the vectorization of following
 103    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 104    otherwise, we rely on dead code elimination for removing it.
 105
 106         For example, say stmt S1 was vectorized into stmt VS1:
 107
 108    VS1: vb = px[i];
 109    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 110    S2:  a = b;
 111
 112    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 113    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 114    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 115    resulting sequence would be:
 116
 117    VS1: vb = px[i];
 118    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 119    VS2: va = vb;
 120    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 121
 122         Operands that are not SSA_NAMEs, are data-refs that appear in
 123    load/store operations (like 'x[i]' in S1), and are handled differently.
 124
 125    Target modeling:
 126    =================
 127         Currently the only target specific information that is used is the
 128    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 129    support different sizes of vectors, for now will need to specify one value
 130    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 131
 132         Since we only vectorize operations which vector form can be
 133    expressed using existing tree codes, to verify that an operation is
 134    supported, the vectorizer checks the relevant optab at the relevant
 135    machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
 136    the value found is CODE_FOR_nothing, then there's no target support, and
 137    we can't vectorize the stmt.
 138
 139    For additional information on this project see:
 140    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 141 */
 142
 143 /* Function vect_determine_vectorization_factor
 144
 145    Determine the vectorization factor (VF). VF is the number of data elements
 146    that are operated upon in parallel in a single iteration of the vectorized
 147    loop. For example, when vectorizing a loop that operates on 4byte elements,
 148    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 149    elements can fit in a single vector register.
 150
 151    We currently support vectorization of loops in which all types operated upon
 152    are of the same size. Therefore this function currently sets VF according to
 153    the size of the types operated upon, and fails if there are multiple sizes
 154    in the loop.
 155
 156    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 157    original loop:
 158         for (i=0; i<N; i++){
 159           a[i] = b[i] + c[i];
 160         }
 161
 162    vectorized loop:
 163         for (i=0; i<N; i+=VF){
 164           a[i:VF] = b[i:VF] + c[i:VF];
 165         }
 166 */
 167
 168 static bool
 169 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 170 {
 171   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 172   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 173   int nbbs = loop->num_nodes;
 174   gimple_stmt_iterator si;
 175   unsigned int vectorization_factor = 0;
 176   tree scalar_type;
 177   gimple phi;
 178   tree vectype;
 179   unsigned int nunits;
 180   stmt_vec_info stmt_info;
 181   int i;
 182   HOST_WIDE_INT dummy;
 183
 184   if (vect_print_dump_info (REPORT_DETAILS))
 185     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 186
 187   for (i = 0; i < nbbs; i++)
 188     {
 189       basic_block bb = bbs[i];
 190
 191       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 192         {
 193           phi = gsi_stmt (si);
 194           stmt_info = vinfo_for_stmt (phi);
 195           if (vect_print_dump_info (REPORT_DETAILS))
 196             {
 197               fprintf (vect_dump, "==> examining phi: ");
 198               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 199             }
 200
 201           gcc_assert (stmt_info);
 202
 203           if (STMT_VINFO_RELEVANT_P (stmt_info))
 204             {
 205               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 206               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 207
 208               if (vect_print_dump_info (REPORT_DETAILS))
 209                 {
 210                   fprintf (vect_dump, "get vectype for scalar type:  ");
 211                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 212                 }
 213
 214               vectype = get_vectype_for_scalar_type (scalar_type);
 215               if (!vectype)
 216                 {
 217                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 218                     {
 219                       fprintf (vect_dump,
 220                                "not vectorized: unsupported data-type ");
 221                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 222                     }
 223                   return false;
 224                 }
 225               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 226
 227               if (vect_print_dump_info (REPORT_DETAILS))
 228                 {
 229                   fprintf (vect_dump, "vectype: ");
 230                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 231                 }
 232
 233               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 234               if (vect_print_dump_info (REPORT_DETAILS))
 235                 fprintf (vect_dump, "nunits = %d", nunits);
 236
 237               if (!vectorization_factor
 238                   || (nunits > vectorization_factor))
 239                 vectorization_factor = nunits;
 240             }
 241         }
 242
 243       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 244         {
 245           tree vf_vectype;
 246           gimple stmt = gsi_stmt (si);
 247           stmt_info = vinfo_for_stmt (stmt);
 248
 249           if (vect_print_dump_info (REPORT_DETAILS))
 250             {
 251               fprintf (vect_dump, "==> examining statement: ");
 252               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 253             }
 254
 255           gcc_assert (stmt_info);
 256
 257           /* skip stmts which do not need to be vectorized.  */
 258           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 259               && !STMT_VINFO_LIVE_P (stmt_info))
 260             {
 261               if (vect_print_dump_info (REPORT_DETAILS))
 262                 fprintf (vect_dump, "skip.");
 263               continue;
 264             }
 265
 266           if (gimple_get_lhs (stmt) == NULL_TREE)
 267             {
 268               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 269                 {
 270                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 271                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 272                 }
 273               return false;
 274             }
 275
 276           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 277             {
 278               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 279                 {
 280                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 281                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 282                 }
 283               return false;
 284             }
 285
 286           if (STMT_VINFO_VECTYPE (stmt_info))
 287             {
 288               /* The only case when a vectype had been already set is for stmts
 289                  that contain a dataref, or for "pattern-stmts" (stmts generated
 290                  by the vectorizer to represent/replace a certain idiom).  */
 291               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 292                           || is_pattern_stmt_p (stmt_info));
 293               vectype = STMT_VINFO_VECTYPE (stmt_info);
 294             }
 295           else
 296             {
 297               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 298                           && !is_pattern_stmt_p (stmt_info));
 299
 300               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 301               if (vect_print_dump_info (REPORT_DETAILS))
 302                 {
 303                   fprintf (vect_dump, "get vectype for scalar type:  ");
 304                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 305                 }
 306               vectype = get_vectype_for_scalar_type (scalar_type);
 307               if (!vectype)
 308                 {
 309                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 310                     {
 311                       fprintf (vect_dump,
 312                                "not vectorized: unsupported data-type ");
 313                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 314                     }
 315                   return false;
 316                 }
 317
 318               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 319             }
 320
 321           /* The vectorization factor is according to the smallest
 322              scalar type (or the largest vector size, but we only
 323              support one vector size per loop).  */
 324           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 325                                                        &dummy);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             {
 328               fprintf (vect_dump, "get vectype for scalar type:  ");
 329               print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 330             }
 331           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 332           if (!vf_vectype)
 333             {
 334               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 335                 {
 336                   fprintf (vect_dump,
 337                            "not vectorized: unsupported data-type ");
 338                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 339                 }
 340               return false;
 341             }
 342
 343           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 344                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 345             {
 346               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 347                 {
 348                   fprintf (vect_dump,
 349                            "not vectorized: different sized vector "
 350                            "types in statement, ");
 351                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 352                   fprintf (vect_dump, " and ");
 353                   print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 354                 }
 355               return false;
 356             }
 357
 358           if (vect_print_dump_info (REPORT_DETAILS))
 359             {
 360               fprintf (vect_dump, "vectype: ");
 361               print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
 362             }
 363
 364           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 365           if (vect_print_dump_info (REPORT_DETAILS))
 366             fprintf (vect_dump, "nunits = %d", nunits);
 367
 368           if (!vectorization_factor
 369               || (nunits > vectorization_factor))
 370             vectorization_factor = nunits;
 371         }
 372     }
 373
 374   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 375   if (vect_print_dump_info (REPORT_DETAILS))
 376     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 377   if (vectorization_factor <= 1)
 378     {
 379       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 380         fprintf (vect_dump, "not vectorized: unsupported data-type");
 381       return false;
 382     }
 383   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 384
 385   return true;
 386 }
 387
 388
 389 /* Function vect_is_simple_iv_evolution.
 390
 391    FORNOW: A simple evolution of an induction variables in the loop is
 392    considered a polynomial evolution with constant step.  */
 393
 394 static bool
 395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 396                              tree * step)
 397 {
 398   tree init_expr;
 399   tree step_expr;
 400   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 401
 402   /* When there is no evolution in this loop, the evolution function
 403      is not "simple".  */
 404   if (evolution_part == NULL_TREE)
 405     return false;
 406
 407   /* When the evolution is a polynomial of degree >= 2
 408      the evolution function is not "simple".  */
 409   if (tree_is_chrec (evolution_part))
 410     return false;
 411
 412   step_expr = evolution_part;
 413   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 414
 415   if (vect_print_dump_info (REPORT_DETAILS))
 416     {
 417       fprintf (vect_dump, "step: ");
 418       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 419       fprintf (vect_dump, ",  init: ");
 420       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 421     }
 422
 423   *init = init_expr;
 424   *step = step_expr;
 425
 426   if (TREE_CODE (step_expr) != INTEGER_CST)
 427     {
 428       if (vect_print_dump_info (REPORT_DETAILS))
 429         fprintf (vect_dump, "step unknown.");
 430       return false;
 431     }
 432
 433   return true;
 434 }
 435
 436 /* Function vect_analyze_scalar_cycles_1.
 437
 438    Examine the cross iteration def-use cycles of scalar variables
 439    in LOOP. LOOP_VINFO represents the loop that is now being
 440    considered for vectorization (can be LOOP, or an outer-loop
 441    enclosing LOOP).  */
 442
 443 static void
 444 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 445 {
 446   basic_block bb = loop->header;
 447   tree dumy;
 448   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 449   gimple_stmt_iterator gsi;
 450   bool double_reduc;
 451
 452   if (vect_print_dump_info (REPORT_DETAILS))
 453     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 454
 455   /* First - identify all inductions. Reduction detection assumes that all the
 456      inductions have been identified, therefore, this order must not be
 457      changed.  */
 458   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 459     {
 460       gimple phi = gsi_stmt (gsi);
 461       tree access_fn = NULL;
 462       tree def = PHI_RESULT (phi);
 463       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 464
 465       if (vect_print_dump_info (REPORT_DETAILS))
 466         {
 467           fprintf (vect_dump, "Analyze phi: ");
 468           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 469         }
 470
 471       /* Skip virtual phi's. The data dependences that are associated with
 472          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 473       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 474         continue;
 475
 476       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 477
 478       /* Analyze the evolution function.  */
 479       access_fn = analyze_scalar_evolution (loop, def);
 480       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 481         {
 482           fprintf (vect_dump, "Access function of PHI: ");
 483           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 484         }
 485
 486       if (!access_fn
 487           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 488         {
 489           VEC_safe_push (gimple, heap, worklist, phi);
 490           continue;
 491         }
 492
 493       if (vect_print_dump_info (REPORT_DETAILS))
 494         fprintf (vect_dump, "Detected induction.");
 495       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 496     }
 497
 498
 499   /* Second - identify all reductions and nested cycles.  */
 500   while (VEC_length (gimple, worklist) > 0)
 501     {
 502       gimple phi = VEC_pop (gimple, worklist);
 503       tree def = PHI_RESULT (phi);
 504       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 505       gimple reduc_stmt;
 506       bool nested_cycle;
 507
 508       if (vect_print_dump_info (REPORT_DETAILS))
 509         {
 510           fprintf (vect_dump, "Analyze phi: ");
 511           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 512         }
 513
 514       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 515       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 516
 517       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 518       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 519                                                 &double_reduc);
 520       if (reduc_stmt)
 521         {
 522           if (double_reduc)
 523             {
 524               if (vect_print_dump_info (REPORT_DETAILS))
 525                 fprintf (vect_dump, "Detected double reduction.");
 526
 527               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 528               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 529                                                     vect_double_reduction_def;
 530             }
 531           else
 532             {
 533               if (nested_cycle)
 534                 {
 535                   if (vect_print_dump_info (REPORT_DETAILS))
 536                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 537
 538                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 539                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 540                                                              vect_nested_cycle;
 541                 }
 542               else
 543                 {
 544                   if (vect_print_dump_info (REPORT_DETAILS))
 545                     fprintf (vect_dump, "Detected reduction.");
 546
 547                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 548                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 549                                                            vect_reduction_def;
 550                   /* Store the reduction cycles for possible vectorization in
 551                      loop-aware SLP.  */
 552                   VEC_safe_push (gimple, heap,
 553                                  LOOP_VINFO_REDUCTIONS (loop_vinfo),
 554                                  reduc_stmt);
 555                 }
 556             }
 557         }
 558       else
 559         if (vect_print_dump_info (REPORT_DETAILS))
 560           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 561     }
 562
 563   VEC_free (gimple, heap, worklist);
 564 }
 565
 566
 567 /* Function vect_analyze_scalar_cycles.
 568
 569    Examine the cross iteration def-use cycles of scalar variables, by
 570    analyzing the loop-header PHIs of scalar variables; Classify each
 571    cycle as one of the following: invariant, induction, reduction, unknown.
 572    We do that for the loop represented by LOOP_VINFO, and also to its
 573    inner-loop, if exists.
 574    Examples for scalar cycles:
 575
 576    Example1: reduction:
 577
 578               loop1:
 579               for (i=0; i<N; i++)
 580                  sum += a[i];
 581
 582    Example2: induction:
 583
 584               loop2:
 585               for (i=0; i<N; i++)
 586                  a[i] = i;  */
 587
 588 static void
 589 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 590 {
 591   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 592
 593   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 594
 595   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 596      Reductions in such inner-loop therefore have different properties than
 597      the reductions in the nest that gets vectorized:
 598      1. When vectorized, they are executed in the same order as in the original
 599         scalar loop, so we can't change the order of computation when
 600         vectorizing them.
 601      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 602         current checks are too strict.  */
 603
 604   if (loop->inner)
 605     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 606 }
 607
 608 /* Function vect_get_loop_niters.
 609
 610    Determine how many iterations the loop is executed.
 611    If an expression that represents the number of iterations
 612    can be constructed, place it in NUMBER_OF_ITERATIONS.
 613    Return the loop exit condition.  */
 614
 615 static gimple
 616 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 617 {
 618   tree niters;
 619
 620   if (vect_print_dump_info (REPORT_DETAILS))
 621     fprintf (vect_dump, "=== get_loop_niters ===");
 622
 623   niters = number_of_exit_cond_executions (loop);
 624
 625   if (niters != NULL_TREE
 626       && niters != chrec_dont_know)
 627     {
 628       *number_of_iterations = niters;
 629
 630       if (vect_print_dump_info (REPORT_DETAILS))
 631         {
 632           fprintf (vect_dump, "==> get_loop_niters:" );
 633           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 634         }
 635     }
 636
 637   return get_loop_exit_condition (loop);
 638 }
 639
 640
 641 /* Function bb_in_loop_p
 642
 643    Used as predicate for dfs order traversal of the loop bbs.  */
 644
 645 static bool
 646 bb_in_loop_p (const_basic_block bb, const void *data)
 647 {
 648   const struct loop *const loop = (const struct loop *)data;
 649   if (flow_bb_inside_loop_p (loop, bb))
 650     return true;
 651   return false;
 652 }
 653
 654
 655 /* Function new_loop_vec_info.
 656
 657    Create and initialize a new loop_vec_info struct for LOOP, as well as
 658    stmt_vec_info structs for all the stmts in LOOP.  */
 659
 660 static loop_vec_info
 661 new_loop_vec_info (struct loop *loop)
 662 {
 663   loop_vec_info res;
 664   basic_block *bbs;
 665   gimple_stmt_iterator si;
 666   unsigned int i, nbbs;
 667
 668   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 669   LOOP_VINFO_LOOP (res) = loop;
 670
 671   bbs = get_loop_body (loop);
 672
 673   /* Create/Update stmt_info for all stmts in the loop.  */
 674   for (i = 0; i < loop->num_nodes; i++)
 675     {
 676       basic_block bb = bbs[i];
 677
 678       /* BBs in a nested inner-loop will have been already processed (because
 679          we will have called vect_analyze_loop_form for any nested inner-loop).
 680          Therefore, for stmts in an inner-loop we just want to update the
 681          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 682          loop_info of the outer-loop we are currently considering to vectorize
 683          (instead of the loop_info of the inner-loop).
 684          For stmts in other BBs we need to create a stmt_info from scratch.  */
 685       if (bb->loop_father != loop)
 686         {
 687           /* Inner-loop bb.  */
 688           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 689           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 690             {
 691               gimple phi = gsi_stmt (si);
 692               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 693               loop_vec_info inner_loop_vinfo =
 694                 STMT_VINFO_LOOP_VINFO (stmt_info);
 695               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 696               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 697             }
 698           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 699            {
 700               gimple stmt = gsi_stmt (si);
 701               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 702               loop_vec_info inner_loop_vinfo =
 703                  STMT_VINFO_LOOP_VINFO (stmt_info);
 704               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 705               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 706            }
 707         }
 708       else
 709         {
 710           /* bb in current nest.  */
 711           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 712             {
 713               gimple phi = gsi_stmt (si);
 714               gimple_set_uid (phi, 0);
 715               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 716             }
 717
 718           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 719             {
 720               gimple stmt = gsi_stmt (si);
 721               gimple_set_uid (stmt, 0);
 722               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 723             }
 724         }
 725     }
 726
 727   /* CHECKME: We want to visit all BBs before their successors (except for
 728      latch blocks, for which this assertion wouldn't hold).  In the simple
 729      case of the loop forms we allow, a dfs order of the BBs would the same
 730      as reversed postorder traversal, so we are safe.  */
 731
 732    free (bbs);
 733    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 734    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 735                               bbs, loop->num_nodes, loop);
 736    gcc_assert (nbbs == loop->num_nodes);
 737
 738   LOOP_VINFO_BBS (res) = bbs;
 739   LOOP_VINFO_NITERS (res) = NULL;
 740   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 741   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 742   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 743   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 744   LOOP_VINFO_VECT_FACTOR (res) = 0;
 745   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 746   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 747   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 748   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 749     VEC_alloc (gimple, heap,
 750                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 751   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 752     VEC_alloc (ddr_p, heap,
 753                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 754   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 755   LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
 756   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 757   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 758   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 759
 760   return res;
 761 }
 762
 763
 764 /* Function destroy_loop_vec_info.
 765
 766    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 767    stmts in the loop.  */
 768
 769 void
 770 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 771 {
 772   struct loop *loop;
 773   basic_block *bbs;
 774   int nbbs;
 775   gimple_stmt_iterator si;
 776   int j;
 777   VEC (slp_instance, heap) *slp_instances;
 778   slp_instance instance;
 779
 780   if (!loop_vinfo)
 781     return;
 782
 783   loop = LOOP_VINFO_LOOP (loop_vinfo);
 784
 785   bbs = LOOP_VINFO_BBS (loop_vinfo);
 786   nbbs = loop->num_nodes;
 787
 788   if (!clean_stmts)
 789     {
 790       free (LOOP_VINFO_BBS (loop_vinfo));
 791       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 792       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 793       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 794
 795       free (loop_vinfo);
 796       loop->aux = NULL;
 797       return;
 798     }
 799
 800   for (j = 0; j < nbbs; j++)
 801     {
 802       basic_block bb = bbs[j];
 803       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 804         free_stmt_vec_info (gsi_stmt (si));
 805
 806       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 807         {
 808           gimple stmt = gsi_stmt (si);
 809           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 810
 811           if (stmt_info)
 812             {
 813               /* Check if this is a "pattern stmt" (introduced by the
 814                  vectorizer during the pattern recognition pass).  */
 815               bool remove_stmt_p = false;
 816               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 817               if (orig_stmt)
 818                 {
 819                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 820                   if (orig_stmt_info
 821                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 822                     remove_stmt_p = true;
 823                 }
 824
 825               /* Free stmt_vec_info.  */
 826               free_stmt_vec_info (stmt);
 827
 828               /* Remove dead "pattern stmts".  */
 829               if (remove_stmt_p)
 830                 gsi_remove (&si, true);
 831             }
 832           gsi_next (&si);
 833         }
 834     }
 835
 836   free (LOOP_VINFO_BBS (loop_vinfo));
 837   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 838   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 839   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 840   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 841   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 842   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 843     vect_free_slp_instance (instance);
 844
 845   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 846   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 847   VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
 848
 849   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 850     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 851
 852   free (loop_vinfo);
 853   loop->aux = NULL;
 854 }
 855
 856
 857 /* Function vect_analyze_loop_1.
 858
 859    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 860    for it. The different analyses will record information in the
 861    loop_vec_info struct.  This is a subset of the analyses applied in
 862    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 863    that is now considered for (outer-loop) vectorization.  */
 864
 865 static loop_vec_info
 866 vect_analyze_loop_1 (struct loop *loop)
 867 {
 868   loop_vec_info loop_vinfo;
 869
 870   if (vect_print_dump_info (REPORT_DETAILS))
 871     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 872
 873   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 874
 875   loop_vinfo = vect_analyze_loop_form (loop);
 876   if (!loop_vinfo)
 877     {
 878       if (vect_print_dump_info (REPORT_DETAILS))
 879         fprintf (vect_dump, "bad inner-loop form.");
 880       return NULL;
 881     }
 882
 883   return loop_vinfo;
 884 }
 885
 886
 887 /* Function vect_analyze_loop_form.
 888
 889    Verify that certain CFG restrictions hold, including:
 890    - the loop has a pre-header
 891    - the loop has a single entry and exit
 892    - the loop exit condition is simple enough, and the number of iterations
 893      can be analyzed (a countable loop).  */
 894
 895 loop_vec_info
 896 vect_analyze_loop_form (struct loop *loop)
 897 {
 898   loop_vec_info loop_vinfo;
 899   gimple loop_cond;
 900   tree number_of_iterations = NULL;
 901   loop_vec_info inner_loop_vinfo = NULL;
 902
 903   if (vect_print_dump_info (REPORT_DETAILS))
 904     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 905
 906   /* Different restrictions apply when we are considering an inner-most loop,
 907      vs. an outer (nested) loop.
 908      (FORNOW. May want to relax some of these restrictions in the future).  */
 909
 910   if (!loop->inner)
 911     {
 912       /* Inner-most loop.  We currently require that the number of BBs is
 913          exactly 2 (the header and latch).  Vectorizable inner-most loops
 914          look like this:
 915
 916                         (pre-header)
 917                            |
 918                           header <--------+
 919                            | |            |
 920                            | +--> latch --+
 921                            |
 922                         (exit-bb)  */
 923
 924       if (loop->num_nodes != 2)
 925         {
 926           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 927             fprintf (vect_dump, "not vectorized: control flow in loop.");
 928           return NULL;
 929         }
 930
 931       if (empty_block_p (loop->header))
 932     {
 933           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 934             fprintf (vect_dump, "not vectorized: empty loop.");
 935       return NULL;
 936     }
 937     }
 938   else
 939     {
 940       struct loop *innerloop = loop->inner;
 941       edge entryedge;
 942
 943       /* Nested loop. We currently require that the loop is doubly-nested,
 944          contains a single inner loop, and the number of BBs is exactly 5.
 945          Vectorizable outer-loops look like this:
 946
 947                         (pre-header)
 948                            |
 949                           header <---+
 950                            |         |
 951                           inner-loop |
 952                            |         |
 953                           tail ------+
 954                            |
 955                         (exit-bb)
 956
 957          The inner-loop has the properties expected of inner-most loops
 958          as described above.  */
 959
 960       if ((loop->inner)->inner || (loop->inner)->next)
 961         {
 962           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 963             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 964           return NULL;
 965         }
 966
 967       /* Analyze the inner-loop.  */
 968       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 969       if (!inner_loop_vinfo)
 970         {
 971           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 972             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 973           return NULL;
 974         }
 975
 976       if (!expr_invariant_in_loop_p (loop,
 977                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 978         {
 979           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 980             fprintf (vect_dump,
 981                      "not vectorized: inner-loop count not invariant.");
 982           destroy_loop_vec_info (inner_loop_vinfo, true);
 983           return NULL;
 984         }
 985
 986       if (loop->num_nodes != 5)
 987         {
 988           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 989             fprintf (vect_dump, "not vectorized: control flow in loop.");
 990           destroy_loop_vec_info (inner_loop_vinfo, true);
 991           return NULL;
 992         }
 993
 994       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 995       entryedge = EDGE_PRED (innerloop->header, 0);
 996       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 997         entryedge = EDGE_PRED (innerloop->header, 1);
 998
 999       if (entryedge->src != loop->header
1000           || !single_exit (innerloop)
1001           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1002         {
1003           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1004             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1005           destroy_loop_vec_info (inner_loop_vinfo, true);
1006           return NULL;
1007         }
1008
1009       if (vect_print_dump_info (REPORT_DETAILS))
1010         fprintf (vect_dump, "Considering outer-loop vectorization.");
1011     }
1012
1013   if (!single_exit (loop)
1014       || EDGE_COUNT (loop->header->preds) != 2)
1015     {
1016       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1017         {
1018           if (!single_exit (loop))
1019             fprintf (vect_dump, "not vectorized: multiple exits.");
1020           else if (EDGE_COUNT (loop->header->preds) != 2)
1021             fprintf (vect_dump, "not vectorized: too many incoming edges.");
1022         }
1023       if (inner_loop_vinfo)
1024         destroy_loop_vec_info (inner_loop_vinfo, true);
1025       return NULL;
1026     }
1027
1028   /* We assume that the loop exit condition is at the end of the loop. i.e,
1029      that the loop is represented as a do-while (with a proper if-guard
1030      before the loop if needed), where the loop header contains all the
1031      executable statements, and the latch is empty.  */
1032   if (!empty_block_p (loop->latch)
1033         || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1034     {
1035       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036         fprintf (vect_dump, "not vectorized: unexpected loop form.");
1037       if (inner_loop_vinfo)
1038         destroy_loop_vec_info (inner_loop_vinfo, true);
1039       return NULL;
1040     }
1041
1042   /* Make sure there exists a single-predecessor exit bb:  */
1043   if (!single_pred_p (single_exit (loop)->dest))
1044     {
1045       edge e = single_exit (loop);
1046       if (!(e->flags & EDGE_ABNORMAL))
1047         {
1048           split_loop_exit_edge (e);
1049           if (vect_print_dump_info (REPORT_DETAILS))
1050             fprintf (vect_dump, "split exit edge.");
1051         }
1052       else
1053         {
1054           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1055             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1056           if (inner_loop_vinfo)
1057             destroy_loop_vec_info (inner_loop_vinfo, true);
1058           return NULL;
1059         }
1060     }
1061
1062   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1063   if (!loop_cond)
1064     {
1065       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1066         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1067       if (inner_loop_vinfo)
1068         destroy_loop_vec_info (inner_loop_vinfo, true);
1069       return NULL;
1070     }
1071
1072   if (!number_of_iterations)
1073     {
1074       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1075         fprintf (vect_dump,
1076                  "not vectorized: number of iterations cannot be computed.");
1077       if (inner_loop_vinfo)
1078         destroy_loop_vec_info (inner_loop_vinfo, true);
1079       return NULL;
1080     }
1081
1082   if (chrec_contains_undetermined (number_of_iterations))
1083     {
1084       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1085         fprintf (vect_dump, "Infinite number of iterations.");
1086       if (inner_loop_vinfo)
1087         destroy_loop_vec_info (inner_loop_vinfo, true);
1088       return NULL;
1089     }
1090
1091   if (!NITERS_KNOWN_P (number_of_iterations))
1092     {
1093       if (vect_print_dump_info (REPORT_DETAILS))
1094         {
1095           fprintf (vect_dump, "Symbolic number of iterations is ");
1096           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1097         }
1098     }
1099   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1100     {
1101       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1102         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1103       if (inner_loop_vinfo)
1104         destroy_loop_vec_info (inner_loop_vinfo, false);
1105       return NULL;
1106     }
1107
1108   loop_vinfo = new_loop_vec_info (loop);
1109   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1110   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1111
1112   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1113
1114   /* CHECKME: May want to keep it around it in the future.  */
1115   if (inner_loop_vinfo)
1116     destroy_loop_vec_info (inner_loop_vinfo, false);
1117
1118   gcc_assert (!loop->aux);
1119   loop->aux = loop_vinfo;
1120   return loop_vinfo;
1121 }
1122
1123
1124 /* Get cost by calling cost target builtin.  */
1125
1126 static inline
1127 int vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1128 {
1129   tree dummy_type = NULL;
1130   int dummy = 0;
1131
1132   return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1133                                                        dummy_type, dummy);
1134 }
1135
1136
1137 /* Function vect_analyze_loop_operations.
1138
1139    Scan the loop stmts and make sure they are all vectorizable.  */
1140
1141 static bool
1142 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1143 {
1144   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1145   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1146   int nbbs = loop->num_nodes;
1147   gimple_stmt_iterator si;
1148   unsigned int vectorization_factor = 0;
1149   int i;
1150   gimple phi;
1151   stmt_vec_info stmt_info;
1152   bool need_to_vectorize = false;
1153   int min_profitable_iters;
1154   int min_scalar_loop_bound;
1155   unsigned int th;
1156   bool only_slp_in_loop = true, ok;
1157
1158   if (vect_print_dump_info (REPORT_DETAILS))
1159     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1160
1161   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1162   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1163
1164   for (i = 0; i < nbbs; i++)
1165     {
1166       basic_block bb = bbs[i];
1167
1168       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1169         {
1170           phi = gsi_stmt (si);
1171           ok = true;
1172
1173           stmt_info = vinfo_for_stmt (phi);
1174           if (vect_print_dump_info (REPORT_DETAILS))
1175             {
1176               fprintf (vect_dump, "examining phi: ");
1177               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1178             }
1179
1180           if (! is_loop_header_bb_p (bb))
1181             {
1182               /* inner-loop loop-closed exit phi in outer-loop vectorization
1183                  (i.e. a phi in the tail of the outer-loop).
1184                  FORNOW: we currently don't support the case that these phis
1185                  are not used in the outerloop (unless it is double reduction,
1186                  i.e., this phi is vect_reduction_def), cause this case
1187                  requires to actually do something here.  */
1188               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1189                    || STMT_VINFO_LIVE_P (stmt_info))
1190                   && STMT_VINFO_DEF_TYPE (stmt_info)
1191                      != vect_double_reduction_def)
1192                 {
1193                   if (vect_print_dump_info (REPORT_DETAILS))
1194                     fprintf (vect_dump,
1195                              "Unsupported loop-closed phi in outer-loop.");
1196                   return false;
1197                 }
1198               continue;
1199             }
1200
1201           gcc_assert (stmt_info);
1202
1203           if (STMT_VINFO_LIVE_P (stmt_info))
1204             {
1205               /* FORNOW: not yet supported.  */
1206               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1207                 fprintf (vect_dump, "not vectorized: value used after loop.");
1208               return false;
1209             }
1210
1211           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1212               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1213             {
1214               /* A scalar-dependence cycle that we don't support.  */
1215               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1216                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1217               return false;
1218             }
1219
1220           if (STMT_VINFO_RELEVANT_P (stmt_info))
1221             {
1222               need_to_vectorize = true;
1223               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1224                 ok = vectorizable_induction (phi, NULL, NULL);
1225             }
1226
1227           if (!ok)
1228             {
1229               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1230                 {
1231                   fprintf (vect_dump,
1232                            "not vectorized: relevant phi not supported: ");
1233                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1234                 }
1235               return false;
1236             }
1237         }
1238
1239       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1240         {
1241           gimple stmt = gsi_stmt (si);
1242           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1243
1244           gcc_assert (stmt_info);
1245
1246           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1247             return false;
1248
1249           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1250                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1251               && !PURE_SLP_STMT (stmt_info))
1252             /* STMT needs both SLP and loop-based vectorization.  */
1253             only_slp_in_loop = false;
1254         }
1255     } /* bbs */
1256
1257   /* All operations in the loop are either irrelevant (deal with loop
1258      control, or dead), or only used outside the loop and can be moved
1259      out of the loop (e.g. invariants, inductions).  The loop can be
1260      optimized away by scalar optimizations.  We're better off not
1261      touching this loop.  */
1262   if (!need_to_vectorize)
1263     {
1264       if (vect_print_dump_info (REPORT_DETAILS))
1265         fprintf (vect_dump,
1266                  "All the computation can be taken out of the loop.");
1267       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1268         fprintf (vect_dump,
1269                  "not vectorized: redundant loop. no profit to vectorize.");
1270       return false;
1271     }
1272
1273   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1274      vectorization factor of the loop is the unrolling factor required by the
1275      SLP instances.  If that unrolling factor is 1, we say, that we perform
1276      pure SLP on loop - cross iteration parallelism is not exploited.  */
1277   if (only_slp_in_loop)
1278     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1279   else
1280     vectorization_factor = least_common_multiple (vectorization_factor,
1281                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1282
1283   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1284
1285   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1286       && vect_print_dump_info (REPORT_DETAILS))
1287     fprintf (vect_dump,
1288         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1289         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1290
1291   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1292       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1293     {
1294       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1295         fprintf (vect_dump, "not vectorized: iteration count too small.");
1296       if (vect_print_dump_info (REPORT_DETAILS))
1297         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1298                  "vectorization factor.");
1299       return false;
1300     }
1301
1302   /* Analyze cost. Decide if worth while to vectorize.  */
1303
1304   /* Once VF is set, SLP costs should be updated since the number of created
1305      vector stmts depends on VF.  */
1306   vect_update_slp_costs_according_to_vf (loop_vinfo);
1307
1308   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1309   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1310
1311   if (min_profitable_iters < 0)
1312     {
1313       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1314         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1315       if (vect_print_dump_info (REPORT_DETAILS))
1316         fprintf (vect_dump, "not vectorized: vector version will never be "
1317                  "profitable.");
1318       return false;
1319     }
1320
1321   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1322                             * vectorization_factor) - 1);
1323
1324   /* Use the cost model only if it is more conservative than user specified
1325      threshold.  */
1326
1327   th = (unsigned) min_scalar_loop_bound;
1328   if (min_profitable_iters
1329       && (!min_scalar_loop_bound
1330           || min_profitable_iters > min_scalar_loop_bound))
1331     th = (unsigned) min_profitable_iters;
1332
1333   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1334       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1335     {
1336       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1337         fprintf (vect_dump, "not vectorized: vectorization not "
1338                  "profitable.");
1339       if (vect_print_dump_info (REPORT_DETAILS))
1340         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1341                  "user specified loop bound parameter or minimum "
1342                  "profitable iterations (whichever is more conservative).");
1343       return false;
1344     }
1345
1346   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1347       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1348       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1349     {
1350       if (vect_print_dump_info (REPORT_DETAILS))
1351         fprintf (vect_dump, "epilog loop required.");
1352       if (!vect_can_advance_ivs_p (loop_vinfo))
1353         {
1354           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1355             fprintf (vect_dump,
1356                      "not vectorized: can't create epilog loop 1.");
1357           return false;
1358         }
1359       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1360         {
1361           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1362             fprintf (vect_dump,
1363                      "not vectorized: can't create epilog loop 2.");
1364           return false;
1365         }
1366     }
1367
1368   return true;
1369 }
1370
1371
1372 /* Function vect_analyze_loop.
1373
1374    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1375    for it. The different analyses will record information in the
1376    loop_vec_info struct.  */
1377 loop_vec_info
1378 vect_analyze_loop (struct loop *loop)
1379 {
1380   bool ok;
1381   loop_vec_info loop_vinfo;
1382   int max_vf = MAX_VECTORIZATION_FACTOR;
1383   int min_vf = 2;
1384
1385   if (vect_print_dump_info (REPORT_DETAILS))
1386     fprintf (vect_dump, "===== analyze_loop_nest =====");
1387
1388   if (loop_outer (loop)
1389       && loop_vec_info_for_loop (loop_outer (loop))
1390       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1391     {
1392       if (vect_print_dump_info (REPORT_DETAILS))
1393         fprintf (vect_dump, "outer-loop already vectorized.");
1394       return NULL;
1395     }
1396
1397   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1398
1399   loop_vinfo = vect_analyze_loop_form (loop);
1400   if (!loop_vinfo)
1401     {
1402       if (vect_print_dump_info (REPORT_DETAILS))
1403         fprintf (vect_dump, "bad loop form.");
1404       return NULL;
1405     }
1406
1407   /* Find all data references in the loop (which correspond to vdefs/vuses)
1408      and analyze their evolution in the loop.  Also adjust the minimal
1409      vectorization factor according to the loads and stores.
1410
1411      FORNOW: Handle only simple, array references, which
1412      alignment can be forced, and aligned pointer-references.  */
1413
1414   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1415   if (!ok)
1416     {
1417       if (vect_print_dump_info (REPORT_DETAILS))
1418         fprintf (vect_dump, "bad data references.");
1419       destroy_loop_vec_info (loop_vinfo, true);
1420       return NULL;
1421     }
1422
1423   /* Classify all cross-iteration scalar data-flow cycles.
1424      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1425
1426   vect_analyze_scalar_cycles (loop_vinfo);
1427
1428   vect_pattern_recog (loop_vinfo);
1429
1430   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1431
1432   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1433   if (!ok)
1434     {
1435       if (vect_print_dump_info (REPORT_DETAILS))
1436         fprintf (vect_dump, "unexpected pattern.");
1437       destroy_loop_vec_info (loop_vinfo, true);
1438       return NULL;
1439     }
1440
1441   /* Analyze data dependences between the data-refs in the loop
1442      and adjust the maximum vectorization factor according to
1443      the dependences.
1444      FORNOW: fail at the first data dependence that we encounter.  */
1445
1446   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1447   if (!ok
1448       || max_vf < min_vf)
1449     {
1450       if (vect_print_dump_info (REPORT_DETAILS))
1451         fprintf (vect_dump, "bad data dependence.");
1452       destroy_loop_vec_info (loop_vinfo, true);
1453       return NULL;
1454     }
1455
1456   ok = vect_determine_vectorization_factor (loop_vinfo);
1457   if (!ok)
1458     {
1459       if (vect_print_dump_info (REPORT_DETAILS))
1460         fprintf (vect_dump, "can't determine vectorization factor.");
1461       destroy_loop_vec_info (loop_vinfo, true);
1462       return NULL;
1463     }
1464   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1465     {
1466       if (vect_print_dump_info (REPORT_DETAILS))
1467         fprintf (vect_dump, "bad data dependence.");
1468       destroy_loop_vec_info (loop_vinfo, true);
1469       return NULL;
1470     }
1471
1472   /* Analyze the alignment of the data-refs in the loop.
1473      Fail if a data reference is found that cannot be vectorized.  */
1474
1475   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1476   if (!ok)
1477     {
1478       if (vect_print_dump_info (REPORT_DETAILS))
1479         fprintf (vect_dump, "bad data alignment.");
1480       destroy_loop_vec_info (loop_vinfo, true);
1481       return NULL;
1482     }
1483
1484   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1485      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1486
1487   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1488   if (!ok)
1489     {
1490       if (vect_print_dump_info (REPORT_DETAILS))
1491         fprintf (vect_dump, "bad data access.");
1492       destroy_loop_vec_info (loop_vinfo, true);
1493       return NULL;
1494     }
1495
1496   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1497      It is important to call pruning after vect_analyze_data_ref_accesses,
1498      since we use grouping information gathered by interleaving analysis.  */
1499   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1500   if (!ok)
1501     {
1502       if (vect_print_dump_info (REPORT_DETAILS))
1503         fprintf (vect_dump, "too long list of versioning for alias "
1504                             "run-time tests.");
1505       destroy_loop_vec_info (loop_vinfo, true);
1506       return NULL;
1507     }
1508
1509   /* This pass will decide on using loop versioning and/or loop peeling in
1510      order to enhance the alignment of data references in the loop.  */
1511
1512   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1513   if (!ok)
1514     {
1515       if (vect_print_dump_info (REPORT_DETAILS))
1516         fprintf (vect_dump, "bad data alignment.");
1517       destroy_loop_vec_info (loop_vinfo, true);
1518       return NULL;
1519     }
1520
1521   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1522   ok = vect_analyze_slp (loop_vinfo, NULL);
1523   if (ok)
1524     {
1525       /* Decide which possible SLP instances to SLP.  */
1526       vect_make_slp_decision (loop_vinfo);
1527
1528       /* Find stmts that need to be both vectorized and SLPed.  */
1529       vect_detect_hybrid_slp (loop_vinfo);
1530     }
1531
1532   /* Scan all the operations in the loop and make sure they are
1533      vectorizable.  */
1534
1535   ok = vect_analyze_loop_operations (loop_vinfo);
1536   if (!ok)
1537     {
1538       if (vect_print_dump_info (REPORT_DETAILS))
1539         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1540       destroy_loop_vec_info (loop_vinfo, true);
1541       return NULL;
1542     }
1543
1544   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1545
1546   return loop_vinfo;
1547 }
1548
1549
1550 /* Function reduction_code_for_scalar_code
1551
1552    Input:
1553    CODE - tree_code of a reduction operations.
1554
1555    Output:
1556    REDUC_CODE - the corresponding tree-code to be used to reduce the
1557       vector of partial results into a single scalar result (which
1558       will also reside in a vector) or ERROR_MARK if the operation is
1559       a supported reduction operation, but does not have such tree-code.
1560
1561    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1562
1563 static bool
1564 reduction_code_for_scalar_code (enum tree_code code,
1565                                 enum tree_code *reduc_code)
1566 {
1567   switch (code)
1568     {
1569       case MAX_EXPR:
1570         *reduc_code = REDUC_MAX_EXPR;
1571         return true;
1572
1573       case MIN_EXPR:
1574         *reduc_code = REDUC_MIN_EXPR;
1575         return true;
1576
1577       case PLUS_EXPR:
1578         *reduc_code = REDUC_PLUS_EXPR;
1579         return true;
1580
1581       case MULT_EXPR:
1582       case MINUS_EXPR:
1583       case BIT_IOR_EXPR:
1584       case BIT_XOR_EXPR:
1585       case BIT_AND_EXPR:
1586         *reduc_code = ERROR_MARK;
1587         return true;
1588
1589       default:
1590        return false;
1591     }
1592 }
1593
1594
1595 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1596    STMT is printed with a message MSG. */
1597
1598 static void
1599 report_vect_op (gimple stmt, const char *msg)
1600 {
1601   fprintf (vect_dump, "%s", msg);
1602   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1603 }
1604
1605
1606 /* Function vect_is_simple_reduction_1
1607
1608    (1) Detect a cross-iteration def-use cycle that represents a simple
1609    reduction computation. We look for the following pattern:
1610
1611    loop_header:
1612      a1 = phi < a0, a2 >
1613      a3 = ...
1614      a2 = operation (a3, a1)
1615
1616    such that:
1617    1. operation is commutative and associative and it is safe to
1618       change the order of the computation (if CHECK_REDUCTION is true)
1619    2. no uses for a2 in the loop (a2 is used out of the loop)
1620    3. no uses of a1 in the loop besides the reduction operation.
1621
1622    Condition 1 is tested here.
1623    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1624
1625    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1626    nested cycles, if CHECK_REDUCTION is false.
1627
1628    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1629    reductions:
1630
1631      a1 = phi < a0, a2 >
1632      inner loop (def of a3)
1633      a2 = phi < a3 >
1634
1635    If MODIFY is true it tries also to rework the code in-place to enable
1636    detection of more reduction patterns.  For the time being we rewrite
1637    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
1638 */
1639
1640 static gimple
1641 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1642                             bool check_reduction, bool *double_reduc,
1643                             bool modify)
1644 {
1645   struct loop *loop = (gimple_bb (phi))->loop_father;
1646   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1647   edge latch_e = loop_latch_edge (loop);
1648   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1649   gimple def_stmt, def1 = NULL, def2 = NULL;
1650   enum tree_code orig_code, code;
1651   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1652   tree type;
1653   int nloop_uses;
1654   tree name;
1655   imm_use_iterator imm_iter;
1656   use_operand_p use_p;
1657   bool phi_def;
1658
1659   *double_reduc = false;
1660
1661   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1662      otherwise, we assume outer loop vectorization.  */
1663   gcc_assert ((check_reduction && loop == vect_loop)
1664               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1665
1666   name = PHI_RESULT (phi);
1667   nloop_uses = 0;
1668   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1669     {
1670       gimple use_stmt = USE_STMT (use_p);
1671       if (is_gimple_debug (use_stmt))
1672         continue;
1673       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1674           && vinfo_for_stmt (use_stmt)
1675           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1676         nloop_uses++;
1677       if (nloop_uses > 1)
1678         {
1679           if (vect_print_dump_info (REPORT_DETAILS))
1680             fprintf (vect_dump, "reduction used in loop.");
1681           return NULL;
1682         }
1683     }
1684
1685   if (TREE_CODE (loop_arg) != SSA_NAME)
1686     {
1687       if (vect_print_dump_info (REPORT_DETAILS))
1688         {
1689           fprintf (vect_dump, "reduction: not ssa_name: ");
1690           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1691         }
1692       return NULL;
1693     }
1694
1695   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1696   if (!def_stmt)
1697     {
1698       if (vect_print_dump_info (REPORT_DETAILS))
1699         fprintf (vect_dump, "reduction: no def_stmt.");
1700       return NULL;
1701     }
1702
1703   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1704     {
1705       if (vect_print_dump_info (REPORT_DETAILS))
1706         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1707       return NULL;
1708     }
1709
1710   if (is_gimple_assign (def_stmt))
1711     {
1712       name = gimple_assign_lhs (def_stmt);
1713       phi_def = false;
1714     }
1715   else
1716     {
1717       name = PHI_RESULT (def_stmt);
1718       phi_def = true;
1719     }
1720
1721   nloop_uses = 0;
1722   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1723     {
1724       gimple use_stmt = USE_STMT (use_p);
1725       if (is_gimple_debug (use_stmt))
1726         continue;
1727       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1728           && vinfo_for_stmt (use_stmt)
1729           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1730         nloop_uses++;
1731       if (nloop_uses > 1)
1732         {
1733           if (vect_print_dump_info (REPORT_DETAILS))
1734             fprintf (vect_dump, "reduction used in loop.");
1735           return NULL;
1736         }
1737     }
1738
1739   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1740      defined in the inner loop.  */
1741   if (phi_def)
1742     {
1743       op1 = PHI_ARG_DEF (def_stmt, 0);
1744
1745       if (gimple_phi_num_args (def_stmt) != 1
1746           || TREE_CODE (op1) != SSA_NAME)
1747         {
1748           if (vect_print_dump_info (REPORT_DETAILS))
1749             fprintf (vect_dump, "unsupported phi node definition.");
1750
1751           return NULL;
1752         }
1753
1754       def1 = SSA_NAME_DEF_STMT (op1);
1755       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1756           && loop->inner
1757           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1758           && is_gimple_assign (def1))
1759         {
1760           if (vect_print_dump_info (REPORT_DETAILS))
1761             report_vect_op (def_stmt, "detected double reduction: ");
1762
1763           *double_reduc = true;
1764           return def_stmt;
1765         }
1766
1767       return NULL;
1768     }
1769
1770   code = orig_code = gimple_assign_rhs_code (def_stmt);
1771
1772   /* We can handle "res -= x[i]", which is non-associative by
1773      simply rewriting this into "res += -x[i]".  Avoid changing
1774      gimple instruction for the first simple tests and only do this
1775      if we're allowed to change code at all.  */
1776   if (code == MINUS_EXPR && modify)
1777     code = PLUS_EXPR;
1778
1779   if (check_reduction
1780       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1781     {
1782       if (vect_print_dump_info (REPORT_DETAILS))
1783         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1784       return NULL;
1785     }
1786
1787   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1788     {
1789       if (code != COND_EXPR)
1790         {
1791           if (vect_print_dump_info (REPORT_DETAILS))
1792             report_vect_op (def_stmt, "reduction: not binary operation: ");
1793
1794           return NULL;
1795         }
1796
1797       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1798       if (COMPARISON_CLASS_P (op3))
1799         {
1800           op4 = TREE_OPERAND (op3, 1);
1801           op3 = TREE_OPERAND (op3, 0);
1802         }
1803
1804       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1805       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1806
1807       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1808         {
1809           if (vect_print_dump_info (REPORT_DETAILS))
1810             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1811
1812           return NULL;
1813         }
1814     }
1815   else
1816     {
1817       op1 = gimple_assign_rhs1 (def_stmt);
1818       op2 = gimple_assign_rhs2 (def_stmt);
1819
1820       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1821         {
1822           if (vect_print_dump_info (REPORT_DETAILS))
1823             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1824
1825           return NULL;
1826         }
1827    }
1828
1829   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1830   if ((TREE_CODE (op1) == SSA_NAME
1831        && !types_compatible_p (type,TREE_TYPE (op1)))
1832       || (TREE_CODE (op2) == SSA_NAME
1833           && !types_compatible_p (type, TREE_TYPE (op2)))
1834       || (op3 && TREE_CODE (op3) == SSA_NAME
1835           && !types_compatible_p (type, TREE_TYPE (op3)))
1836       || (op4 && TREE_CODE (op4) == SSA_NAME
1837           && !types_compatible_p (type, TREE_TYPE (op4))))
1838     {
1839       if (vect_print_dump_info (REPORT_DETAILS))
1840         {
1841           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1842           print_generic_expr (vect_dump, type, TDF_SLIM);
1843           fprintf (vect_dump, ", operands types: ");
1844           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1845           fprintf (vect_dump, ",");
1846           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1847           if (op3)
1848             {
1849               fprintf (vect_dump, ",");
1850               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1851             }
1852
1853           if (op4)
1854             {
1855               fprintf (vect_dump, ",");
1856               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1857             }
1858         }
1859
1860       return NULL;
1861     }
1862
1863   /* Check that it's ok to change the order of the computation.
1864      Generally, when vectorizing a reduction we change the order of the
1865      computation.  This may change the behavior of the program in some
1866      cases, so we need to check that this is ok.  One exception is when
1867      vectorizing an outer-loop: the inner-loop is executed sequentially,
1868      and therefore vectorizing reductions in the inner-loop during
1869      outer-loop vectorization is safe.  */
1870
1871   /* CHECKME: check for !flag_finite_math_only too?  */
1872   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1873       && check_reduction)
1874     {
1875       /* Changing the order of operations changes the semantics.  */
1876       if (vect_print_dump_info (REPORT_DETAILS))
1877         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1878       return NULL;
1879     }
1880   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1881            && check_reduction)
1882     {
1883       /* Changing the order of operations changes the semantics.  */
1884       if (vect_print_dump_info (REPORT_DETAILS))
1885         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1886       return NULL;
1887     }
1888   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1889     {
1890       /* Changing the order of operations changes the semantics.  */
1891       if (vect_print_dump_info (REPORT_DETAILS))
1892         report_vect_op (def_stmt,
1893                         "reduction: unsafe fixed-point math optimization: ");
1894       return NULL;
1895     }
1896
1897   /* If we detected "res -= x[i]" earlier, rewrite it into
1898      "res += -x[i]" now.  If this turns out to be useless reassoc
1899      will clean it up again.  */
1900   if (orig_code == MINUS_EXPR)
1901     {
1902       tree rhs = gimple_assign_rhs2 (def_stmt);
1903       tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
1904       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
1905                                                          rhs, NULL);
1906       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
1907       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
1908                                                           loop_info, NULL));
1909       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
1910       gimple_assign_set_rhs2 (def_stmt, negrhs);
1911       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
1912       update_stmt (def_stmt);
1913     }
1914
1915   /* Reduction is safe. We're dealing with one of the following:
1916      1) integer arithmetic and no trapv
1917      2) floating point arithmetic, and special flags permit this optimization
1918      3) nested cycle (i.e., outer loop vectorization).  */
1919   if (TREE_CODE (op1) == SSA_NAME)
1920     def1 = SSA_NAME_DEF_STMT (op1);
1921
1922   if (TREE_CODE (op2) == SSA_NAME)
1923     def2 = SSA_NAME_DEF_STMT (op2);
1924
1925   if (code != COND_EXPR
1926       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1927     {
1928       if (vect_print_dump_info (REPORT_DETAILS))
1929         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1930       return NULL;
1931     }
1932
1933   /* Check that one def is the reduction def, defined by PHI,
1934      the other def is either defined in the loop ("vect_internal_def"),
1935      or it's an induction (defined by a loop-header phi-node).  */
1936
1937   if (def2 && def2 == phi
1938       && (code == COND_EXPR
1939           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1940               && (is_gimple_assign (def1)
1941                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1942                       == vect_induction_def
1943                   || (gimple_code (def1) == GIMPLE_PHI
1944                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1945                           == vect_internal_def
1946                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1947     {
1948       if (vect_print_dump_info (REPORT_DETAILS))
1949         report_vect_op (def_stmt, "detected reduction: ");
1950       return def_stmt;
1951     }
1952   else if (def1 && def1 == phi
1953            && (code == COND_EXPR
1954                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1955                    && (is_gimple_assign (def2)
1956                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1957                            == vect_induction_def
1958                        || (gimple_code (def2) == GIMPLE_PHI
1959                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1960                                == vect_internal_def
1961                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
1962     {
1963       if (check_reduction)
1964         {
1965           /* Swap operands (just for simplicity - so that the rest of the code
1966              can assume that the reduction variable is always the last (second)
1967              argument).  */
1968           if (vect_print_dump_info (REPORT_DETAILS))
1969             report_vect_op (def_stmt,
1970                             "detected reduction: need to swap operands: ");
1971
1972           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1973                               gimple_assign_rhs2_ptr (def_stmt));
1974         }
1975       else
1976         {
1977           if (vect_print_dump_info (REPORT_DETAILS))
1978             report_vect_op (def_stmt, "detected reduction: ");
1979         }
1980
1981       return def_stmt;
1982     }
1983   else
1984     {
1985       if (vect_print_dump_info (REPORT_DETAILS))
1986         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1987
1988       return NULL;
1989     }
1990 }
1991
1992 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
1993    in-place.  Arguments as there.  */
1994
1995 static gimple
1996 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1997                           bool check_reduction, bool *double_reduc)
1998 {
1999   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2000                                      double_reduc, false);
2001 }
2002
2003 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2004    in-place if it enables detection of more reductions.  Arguments
2005    as there.  */
2006
2007 gimple
2008 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2009                           bool check_reduction, bool *double_reduc)
2010 {
2011   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2012                                      double_reduc, true);
2013 }
2014
2015 /* Calculate the cost of one scalar iteration of the loop.  */
2016 int
2017 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2018 {
2019   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2020   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2021   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2022   int innerloop_iters, i, stmt_cost;
2023
2024   /* Count statements in scalar loop. Using this as scalar cost for a single
2025      iteration for now.
2026
2027      TODO: Add outer loop support.
2028
2029      TODO: Consider assigning different costs to different scalar
2030      statements.  */
2031
2032   /* FORNOW.  */
2033   if (loop->inner)
2034     innerloop_iters = 50; /* FIXME */
2035
2036   for (i = 0; i < nbbs; i++)
2037     {
2038       gimple_stmt_iterator si;
2039       basic_block bb = bbs[i];
2040
2041       if (bb->loop_father == loop->inner)
2042         factor = innerloop_iters;
2043       else
2044         factor = 1;
2045
2046       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2047         {
2048           gimple stmt = gsi_stmt (si);
2049           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2050
2051           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2052             continue;
2053
2054           /* Skip stmts that are not vectorized inside the loop.  */
2055           if (stmt_info
2056               && !STMT_VINFO_RELEVANT_P (stmt_info)
2057               && (!STMT_VINFO_LIVE_P (stmt_info)
2058                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2059             continue;
2060
2061           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2062             {
2063               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2064                stmt_cost = vect_get_cost (scalar_load);
2065              else
2066                stmt_cost = vect_get_cost (scalar_store);
2067             }
2068           else
2069             stmt_cost = vect_get_cost (scalar_stmt);
2070
2071           scalar_single_iter_cost += stmt_cost * factor;
2072         }
2073     }
2074   return scalar_single_iter_cost;
2075 }
2076
2077 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2078 int
2079 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2080                              int *peel_iters_epilogue,
2081                              int scalar_single_iter_cost)
2082 {
2083   int peel_guard_costs = 0;
2084   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2085
2086   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2087     {
2088       *peel_iters_epilogue = vf/2;
2089       if (vect_print_dump_info (REPORT_COST))
2090         fprintf (vect_dump, "cost model: "
2091                             "epilogue peel iters set to vf/2 because "
2092                             "loop iterations are unknown .");
2093
2094       /* If peeled iterations are known but number of scalar loop
2095          iterations are unknown, count a taken branch per peeled loop.  */
2096       peel_guard_costs =  2 * vect_get_cost (cond_branch_taken);
2097     }
2098   else
2099     {
2100       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2101       peel_iters_prologue = niters < peel_iters_prologue ?
2102                             niters : peel_iters_prologue;
2103       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2104     }
2105
2106    return (peel_iters_prologue * scalar_single_iter_cost)
2107             + (*peel_iters_epilogue * scalar_single_iter_cost)
2108            + peel_guard_costs;
2109 }
2110
2111 /* Function vect_estimate_min_profitable_iters
2112
2113    Return the number of iterations required for the vector version of the
2114    loop to be profitable relative to the cost of the scalar version of the
2115    loop.
2116
2117    TODO: Take profile info into account before making vectorization
2118    decisions, if available.  */
2119
2120 int
2121 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2122 {
2123   int i;
2124   int min_profitable_iters;
2125   int peel_iters_prologue;
2126   int peel_iters_epilogue;
2127   int vec_inside_cost = 0;
2128   int vec_outside_cost = 0;
2129   int scalar_single_iter_cost = 0;
2130   int scalar_outside_cost = 0;
2131   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2132   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2133   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2134   int nbbs = loop->num_nodes;
2135   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2136   int peel_guard_costs = 0;
2137   int innerloop_iters = 0, factor;
2138   VEC (slp_instance, heap) *slp_instances;
2139   slp_instance instance;
2140
2141   /* Cost model disabled.  */
2142   if (!flag_vect_cost_model)
2143     {
2144       if (vect_print_dump_info (REPORT_COST))
2145         fprintf (vect_dump, "cost model disabled.");
2146       return 0;
2147     }
2148
2149   /* Requires loop versioning tests to handle misalignment.  */
2150   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2151     {
2152       /*  FIXME: Make cost depend on complexity of individual check.  */
2153       vec_outside_cost +=
2154         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2155       if (vect_print_dump_info (REPORT_COST))
2156         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2157                  "versioning to treat misalignment.\n");
2158     }
2159
2160   /* Requires loop versioning with alias checks.  */
2161   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2162     {
2163       /*  FIXME: Make cost depend on complexity of individual check.  */
2164       vec_outside_cost +=
2165         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2166       if (vect_print_dump_info (REPORT_COST))
2167         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2168                  "versioning aliasing.\n");
2169     }
2170
2171   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2172       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2173     vec_outside_cost += vect_get_cost (cond_branch_taken);
2174
2175   /* Count statements in scalar loop.  Using this as scalar cost for a single
2176      iteration for now.
2177
2178      TODO: Add outer loop support.
2179
2180      TODO: Consider assigning different costs to different scalar
2181      statements.  */
2182
2183   /* FORNOW.  */
2184   if (loop->inner)
2185     innerloop_iters = 50; /* FIXME */
2186
2187   for (i = 0; i < nbbs; i++)
2188     {
2189       gimple_stmt_iterator si;
2190       basic_block bb = bbs[i];
2191
2192       if (bb->loop_father == loop->inner)
2193         factor = innerloop_iters;
2194       else
2195         factor = 1;
2196
2197       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2198         {
2199           gimple stmt = gsi_stmt (si);
2200           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2201           /* Skip stmts that are not vectorized inside the loop.  */
2202           if (!STMT_VINFO_RELEVANT_P (stmt_info)
2203               && (!STMT_VINFO_LIVE_P (stmt_info)
2204                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2205             continue;
2206           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2207           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2208              some of the "outside" costs are generated inside the outer-loop.  */
2209           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2210         }
2211     }
2212
2213   scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2214
2215   /* Add additional cost for the peeled instructions in prologue and epilogue
2216      loop.
2217
2218      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2219      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2220
2221      TODO: Build an expression that represents peel_iters for prologue and
2222      epilogue to be used in a run-time test.  */
2223
2224   if (npeel  < 0)
2225     {
2226       peel_iters_prologue = vf/2;
2227       if (vect_print_dump_info (REPORT_COST))
2228         fprintf (vect_dump, "cost model: "
2229                  "prologue peel iters set to vf/2.");
2230
2231       /* If peeling for alignment is unknown, loop bound of main loop becomes
2232          unknown.  */
2233       peel_iters_epilogue = vf/2;
2234       if (vect_print_dump_info (REPORT_COST))
2235         fprintf (vect_dump, "cost model: "
2236                  "epilogue peel iters set to vf/2 because "
2237                  "peeling for alignment is unknown .");
2238
2239       /* If peeled iterations are unknown, count a taken branch and a not taken
2240          branch per peeled loop. Even if scalar loop iterations are known,
2241          vector iterations are not known since peeled prologue iterations are
2242          not known. Hence guards remain the same.  */
2243       peel_guard_costs +=  2 * (vect_get_cost (cond_branch_taken)
2244                                 + vect_get_cost (cond_branch_not_taken));
2245       vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2246                            + (peel_iters_epilogue * scalar_single_iter_cost)
2247                            + peel_guard_costs;
2248     }
2249   else
2250     {
2251       peel_iters_prologue = npeel;
2252       vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2253                                     peel_iters_prologue, &peel_iters_epilogue,
2254                                     scalar_single_iter_cost);
2255     }
2256
2257   /* FORNOW: The scalar outside cost is incremented in one of the
2258      following ways:
2259
2260      1. The vectorizer checks for alignment and aliasing and generates
2261      a condition that allows dynamic vectorization.  A cost model
2262      check is ANDED with the versioning condition.  Hence scalar code
2263      path now has the added cost of the versioning check.
2264
2265        if (cost > th & versioning_check)
2266          jmp to vector code
2267
2268      Hence run-time scalar is incremented by not-taken branch cost.
2269
2270      2. The vectorizer then checks if a prologue is required.  If the
2271      cost model check was not done before during versioning, it has to
2272      be done before the prologue check.
2273
2274        if (cost <= th)
2275          prologue = scalar_iters
2276        if (prologue == 0)
2277          jmp to vector code
2278        else
2279          execute prologue
2280        if (prologue == num_iters)
2281          go to exit
2282
2283      Hence the run-time scalar cost is incremented by a taken branch,
2284      plus a not-taken branch, plus a taken branch cost.
2285
2286      3. The vectorizer then checks if an epilogue is required.  If the
2287      cost model check was not done before during prologue check, it
2288      has to be done with the epilogue check.
2289
2290        if (prologue == 0)
2291          jmp to vector code
2292        else
2293          execute prologue
2294        if (prologue == num_iters)
2295          go to exit
2296        vector code:
2297          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2298            jmp to epilogue
2299
2300      Hence the run-time scalar cost should be incremented by 2 taken
2301      branches.
2302
2303      TODO: The back end may reorder the BBS's differently and reverse
2304      conditions/branch directions.  Change the estimates below to
2305      something more reasonable.  */
2306
2307   /* If the number of iterations is known and we do not do versioning, we can
2308      decide whether to vectorize at compile time. Hence the scalar version
2309      do not carry cost model guard costs.  */
2310   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2311       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2312       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2313     {
2314       /* Cost model check occurs at versioning.  */
2315       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2316           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2317         scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2318       else
2319         {
2320           /* Cost model check occurs at prologue generation.  */
2321           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2322             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2323                                    + vect_get_cost (cond_branch_not_taken);
2324           /* Cost model check occurs at epilogue generation.  */
2325           else
2326             scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2327         }
2328     }
2329
2330   /* Add SLP costs.  */
2331   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2332   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2333     {
2334       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2335       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2336     }
2337
2338   /* Calculate number of iterations required to make the vector version
2339      profitable, relative to the loop bodies only. The following condition
2340      must hold true:
2341      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2342      where
2343      SIC = scalar iteration cost, VIC = vector iteration cost,
2344      VOC = vector outside cost, VF = vectorization factor,
2345      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2346      SOC = scalar outside cost for run time cost model check.  */
2347
2348   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2349     {
2350       if (vec_outside_cost <= 0)
2351         min_profitable_iters = 1;
2352       else
2353         {
2354           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2355                                   - vec_inside_cost * peel_iters_prologue
2356                                   - vec_inside_cost * peel_iters_epilogue)
2357                                  / ((scalar_single_iter_cost * vf)
2358                                     - vec_inside_cost);
2359
2360           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2361               <= ((vec_inside_cost * min_profitable_iters)
2362                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2363             min_profitable_iters++;
2364         }
2365     }
2366   /* vector version will never be profitable.  */
2367   else
2368     {
2369       if (vect_print_dump_info (REPORT_COST))
2370         fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2371                  "divided by the scalar iteration cost = %d "
2372                  "is greater or equal to the vectorization factor = %d.",
2373                  vec_inside_cost, scalar_single_iter_cost, vf);
2374       return -1;
2375     }
2376
2377   if (vect_print_dump_info (REPORT_COST))
2378     {
2379       fprintf (vect_dump, "Cost model analysis: \n");
2380       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2381                vec_inside_cost);
2382       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2383                vec_outside_cost);
2384       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2385                scalar_single_iter_cost);
2386       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2387       fprintf (vect_dump, "  prologue iterations: %d\n",
2388                peel_iters_prologue);
2389       fprintf (vect_dump, "  epilogue iterations: %d\n",
2390                peel_iters_epilogue);
2391       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2392                min_profitable_iters);
2393     }
2394
2395   min_profitable_iters =
2396         min_profitable_iters < vf ? vf : min_profitable_iters;
2397
2398   /* Because the condition we create is:
2399      if (niters <= min_profitable_iters)
2400        then skip the vectorized loop.  */
2401   min_profitable_iters--;
2402
2403   if (vect_print_dump_info (REPORT_COST))
2404     fprintf (vect_dump, "  Profitability threshold = %d\n",
2405              min_profitable_iters);
2406
2407   return min_profitable_iters;
2408 }
2409
2410
2411 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2412    functions. Design better to avoid maintenance issues.  */
2413
2414 /* Function vect_model_reduction_cost.
2415
2416    Models cost for a reduction operation, including the vector ops
2417    generated within the strip-mine loop, the initial definition before
2418    the loop, and the epilogue code that must be generated.  */
2419
2420 static bool
2421 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2422                            int ncopies)
2423 {
2424   int outer_cost = 0;
2425   enum tree_code code;
2426   optab optab;
2427   tree vectype;
2428   gimple stmt, orig_stmt;
2429   tree reduction_op;
2430   enum machine_mode mode;
2431   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2432   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2433
2434
2435   /* Cost of reduction op inside loop.  */
2436   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2437     += ncopies * vect_get_cost (vector_stmt);
2438
2439   stmt = STMT_VINFO_STMT (stmt_info);
2440
2441   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2442     {
2443     case GIMPLE_SINGLE_RHS:
2444       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2445       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2446       break;
2447     case GIMPLE_UNARY_RHS:
2448       reduction_op = gimple_assign_rhs1 (stmt);
2449       break;
2450     case GIMPLE_BINARY_RHS:
2451       reduction_op = gimple_assign_rhs2 (stmt);
2452       break;
2453     default:
2454       gcc_unreachable ();
2455     }
2456
2457   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2458   if (!vectype)
2459     {
2460       if (vect_print_dump_info (REPORT_COST))
2461         {
2462           fprintf (vect_dump, "unsupported data-type ");
2463           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2464         }
2465       return false;
2466    }
2467
2468   mode = TYPE_MODE (vectype);
2469   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2470
2471   if (!orig_stmt)
2472     orig_stmt = STMT_VINFO_STMT (stmt_info);
2473
2474   code = gimple_assign_rhs_code (orig_stmt);
2475
2476   /* Add in cost for initial definition.  */
2477   outer_cost += vect_get_cost (scalar_to_vec);
2478
2479   /* Determine cost of epilogue code.
2480
2481      We have a reduction operator that will reduce the vector in one statement.
2482      Also requires scalar extract.  */
2483
2484   if (!nested_in_vect_loop_p (loop, orig_stmt))
2485     {
2486       if (reduc_code != ERROR_MARK)
2487         outer_cost += vect_get_cost (vector_stmt)
2488                       + vect_get_cost (vec_to_scalar);
2489       else
2490         {
2491           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2492           tree bitsize =
2493             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2494           int element_bitsize = tree_low_cst (bitsize, 1);
2495           int nelements = vec_size_in_bits / element_bitsize;
2496
2497           optab = optab_for_tree_code (code, vectype, optab_default);
2498
2499           /* We have a whole vector shift available.  */
2500           if (VECTOR_MODE_P (mode)
2501               && optab_handler (optab, mode) != CODE_FOR_nothing
2502               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2503             /* Final reduction via vector shifts and the reduction operator. Also
2504                requires scalar extract.  */
2505             outer_cost += ((exact_log2(nelements) * 2)
2506               * vect_get_cost (vector_stmt)
2507               + vect_get_cost (vec_to_scalar));
2508           else
2509             /* Use extracts and reduction op for final reduction.  For N elements,
2510                we have N extracts and N-1 reduction ops.  */
2511             outer_cost += ((nelements + nelements - 1)
2512               * vect_get_cost (vector_stmt));
2513         }
2514     }
2515
2516   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2517
2518   if (vect_print_dump_info (REPORT_COST))
2519     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2520              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2521              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2522
2523   return true;
2524 }
2525
2526
2527 /* Function vect_model_induction_cost.
2528
2529    Models cost for induction operations.  */
2530
2531 static void
2532 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2533 {
2534   /* loop cost for vec_loop.  */
2535   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2536     = ncopies * vect_get_cost (vector_stmt);
2537   /* prologue cost for vec_init and vec_step.  */
2538   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2539     = 2 * vect_get_cost (scalar_to_vec);
2540
2541   if (vect_print_dump_info (REPORT_COST))
2542     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2543              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2544              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2545 }
2546
2547
2548 /* Function get_initial_def_for_induction
2549
2550    Input:
2551    STMT - a stmt that performs an induction operation in the loop.
2552    IV_PHI - the initial value of the induction variable
2553
2554    Output:
2555    Return a vector variable, initialized with the first VF values of
2556    the induction variable. E.g., for an iv with IV_PHI='X' and
2557    evolution S, for a vector of 4 units, we want to return:
2558    [X, X + S, X + 2*S, X + 3*S].  */
2559
2560 static tree
2561 get_initial_def_for_induction (gimple iv_phi)
2562 {
2563   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2564   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2565   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2566   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2567   tree vectype;
2568   int nunits;
2569   edge pe = loop_preheader_edge (loop);
2570   struct loop *iv_loop;
2571   basic_block new_bb;
2572   tree vec, vec_init, vec_step, t;
2573   tree access_fn;
2574   tree new_var;
2575   tree new_name;
2576   gimple init_stmt, induction_phi, new_stmt;
2577   tree induc_def, vec_def, vec_dest;
2578   tree init_expr, step_expr;
2579   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2580   int i;
2581   bool ok;
2582   int ncopies;
2583   tree expr;
2584   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2585   bool nested_in_vect_loop = false;
2586   gimple_seq stmts = NULL;
2587   imm_use_iterator imm_iter;
2588   use_operand_p use_p;
2589   gimple exit_phi;
2590   edge latch_e;
2591   tree loop_arg;
2592   gimple_stmt_iterator si;
2593   basic_block bb = gimple_bb (iv_phi);
2594   tree stepvectype;
2595
2596   vectype = get_vectype_for_scalar_type (scalar_type);
2597   gcc_assert (vectype);
2598   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2599   ncopies = vf / nunits;
2600
2601   gcc_assert (phi_info);
2602   gcc_assert (ncopies >= 1);
2603
2604   /* Find the first insertion point in the BB.  */
2605   si = gsi_after_labels (bb);
2606
2607   if (INTEGRAL_TYPE_P (scalar_type))
2608     step_expr = build_int_cst (scalar_type, 0);
2609   else if (POINTER_TYPE_P (scalar_type))
2610     step_expr = build_int_cst (sizetype, 0);
2611   else
2612     step_expr = build_real (scalar_type, dconst0);
2613
2614   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2615   if (nested_in_vect_loop_p (loop, iv_phi))
2616     {
2617       nested_in_vect_loop = true;
2618       iv_loop = loop->inner;
2619     }
2620   else
2621     iv_loop = loop;
2622   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2623
2624   latch_e = loop_latch_edge (iv_loop);
2625   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2626
2627   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2628   gcc_assert (access_fn);
2629   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2630                                     &init_expr, &step_expr);
2631   gcc_assert (ok);
2632   pe = loop_preheader_edge (iv_loop);
2633
2634   /* Create the vector that holds the initial_value of the induction.  */
2635   if (nested_in_vect_loop)
2636     {
2637       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2638          been created during vectorization of previous stmts; We obtain it from
2639          the STMT_VINFO_VEC_STMT of the defining stmt. */
2640       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2641                                            loop_preheader_edge (iv_loop));
2642       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2643     }
2644   else
2645     {
2646       /* iv_loop is the loop to be vectorized. Create:
2647          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2648       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2649       add_referenced_var (new_var);
2650
2651       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2652       if (stmts)
2653         {
2654           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2655           gcc_assert (!new_bb);
2656         }
2657
2658       t = NULL_TREE;
2659       t = tree_cons (NULL_TREE, init_expr, t);
2660       for (i = 1; i < nunits; i++)
2661         {
2662           /* Create: new_name_i = new_name + step_expr  */
2663           enum tree_code code = POINTER_TYPE_P (scalar_type)
2664                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2665           init_stmt = gimple_build_assign_with_ops (code, new_var,
2666                                                     new_name, step_expr);
2667           new_name = make_ssa_name (new_var, init_stmt);
2668           gimple_assign_set_lhs (init_stmt, new_name);
2669
2670           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2671           gcc_assert (!new_bb);
2672
2673           if (vect_print_dump_info (REPORT_DETAILS))
2674             {
2675               fprintf (vect_dump, "created new init_stmt: ");
2676               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2677             }
2678           t = tree_cons (NULL_TREE, new_name, t);
2679         }
2680       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2681       vec = build_constructor_from_list (vectype, nreverse (t));
2682       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2683     }
2684
2685
2686   /* Create the vector that holds the step of the induction.  */
2687   if (nested_in_vect_loop)
2688     /* iv_loop is nested in the loop to be vectorized. Generate:
2689        vec_step = [S, S, S, S]  */
2690     new_name = step_expr;
2691   else
2692     {
2693       /* iv_loop is the loop to be vectorized. Generate:
2694           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2695       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2696       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2697                               expr, step_expr);
2698     }
2699
2700   t = NULL_TREE;
2701   for (i = 0; i < nunits; i++)
2702     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2703   gcc_assert (CONSTANT_CLASS_P (new_name));
2704   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2705   gcc_assert (stepvectype);
2706   vec = build_vector (stepvectype, t);
2707   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2708
2709
2710   /* Create the following def-use cycle:
2711      loop prolog:
2712          vec_init = ...
2713          vec_step = ...
2714      loop:
2715          vec_iv = PHI <vec_init, vec_loop>
2716          ...
2717          STMT
2718          ...
2719          vec_loop = vec_iv + vec_step;  */
2720
2721   /* Create the induction-phi that defines the induction-operand.  */
2722   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2723   add_referenced_var (vec_dest);
2724   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2725   set_vinfo_for_stmt (induction_phi,
2726                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2727   induc_def = PHI_RESULT (induction_phi);
2728
2729   /* Create the iv update inside the loop  */
2730   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2731                                            induc_def, vec_step);
2732   vec_def = make_ssa_name (vec_dest, new_stmt);
2733   gimple_assign_set_lhs (new_stmt, vec_def);
2734   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2735   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2736                                                    NULL));
2737
2738   /* Set the arguments of the phi node:  */
2739   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2740   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2741                UNKNOWN_LOCATION);
2742
2743
2744   /* In case that vectorization factor (VF) is bigger than the number
2745      of elements that we can fit in a vectype (nunits), we have to generate
2746      more than one vector stmt - i.e - we need to "unroll" the
2747      vector stmt by a factor VF/nunits.  For more details see documentation
2748      in vectorizable_operation.  */
2749
2750   if (ncopies > 1)
2751     {
2752       stmt_vec_info prev_stmt_vinfo;
2753       /* FORNOW. This restriction should be relaxed.  */
2754       gcc_assert (!nested_in_vect_loop);
2755
2756       /* Create the vector that holds the step of the induction.  */
2757       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2758       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2759                               expr, step_expr);
2760       t = NULL_TREE;
2761       for (i = 0; i < nunits; i++)
2762         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2763       gcc_assert (CONSTANT_CLASS_P (new_name));
2764       vec = build_vector (stepvectype, t);
2765       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2766
2767       vec_def = induc_def;
2768       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2769       for (i = 1; i < ncopies; i++)
2770         {
2771           /* vec_i = vec_prev + vec_step  */
2772           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2773                                                    vec_def, vec_step);
2774           vec_def = make_ssa_name (vec_dest, new_stmt);
2775           gimple_assign_set_lhs (new_stmt, vec_def);
2776
2777           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2778           set_vinfo_for_stmt (new_stmt,
2779                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2780           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2781           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2782         }
2783     }
2784
2785   if (nested_in_vect_loop)
2786     {
2787       /* Find the loop-closed exit-phi of the induction, and record
2788          the final vector of induction results:  */
2789       exit_phi = NULL;
2790       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2791         {
2792           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2793             {
2794               exit_phi = USE_STMT (use_p);
2795               break;
2796             }
2797         }
2798       if (exit_phi)
2799         {
2800           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2801           /* FORNOW. Currently not supporting the case that an inner-loop induction
2802              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2803           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2804                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2805
2806           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2807           if (vect_print_dump_info (REPORT_DETAILS))
2808             {
2809               fprintf (vect_dump, "vector of inductions after inner-loop:");
2810               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2811             }
2812         }
2813     }
2814
2815
2816   if (vect_print_dump_info (REPORT_DETAILS))
2817     {
2818       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2819       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2820       fprintf (vect_dump, "\n");
2821       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2822     }
2823
2824   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2825   return induc_def;
2826 }
2827
2828
2829 /* Function get_initial_def_for_reduction
2830
2831    Input:
2832    STMT - a stmt that performs a reduction operation in the loop.
2833    INIT_VAL - the initial value of the reduction variable
2834
2835    Output:
2836    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2837         of the reduction (used for adjusting the epilog - see below).
2838    Return a vector variable, initialized according to the operation that STMT
2839         performs. This vector will be used as the initial value of the
2840         vector of partial results.
2841
2842    Option1 (adjust in epilog): Initialize the vector as follows:
2843      add/bit or/xor:    [0,0,...,0,0]
2844      mult/bit and:      [1,1,...,1,1]
2845      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2846    and when necessary (e.g. add/mult case) let the caller know
2847    that it needs to adjust the result by init_val.
2848
2849    Option2: Initialize the vector as follows:
2850      add/bit or/xor:    [init_val,0,0,...,0]
2851      mult/bit and:      [init_val,1,1,...,1]
2852      min/max/cond_expr: [init_val,init_val,...,init_val]
2853    and no adjustments are needed.
2854
2855    For example, for the following code:
2856
2857    s = init_val;
2858    for (i=0;i<n;i++)
2859      s = s + a[i];
2860
2861    STMT is 's = s + a[i]', and the reduction variable is 's'.
2862    For a vector of 4 units, we want to return either [0,0,0,init_val],
2863    or [0,0,0,0] and let the caller know that it needs to adjust
2864    the result at the end by 'init_val'.
2865
2866    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2867    initialization vector is simpler (same element in all entries), if
2868    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2869
2870    A cost model should help decide between these two schemes.  */
2871
2872 tree
2873 get_initial_def_for_reduction (gimple stmt, tree init_val,
2874                                tree *adjustment_def)
2875 {
2876   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2877   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2878   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2879   tree scalar_type = TREE_TYPE (init_val);
2880   tree vectype = get_vectype_for_scalar_type (scalar_type);
2881   int nunits;
2882   enum tree_code code = gimple_assign_rhs_code (stmt);
2883   tree def_for_init;
2884   tree init_def;
2885   tree t = NULL_TREE;
2886   int i;
2887   bool nested_in_vect_loop = false;
2888   tree init_value;
2889   REAL_VALUE_TYPE real_init_val = dconst0;
2890   int int_init_val = 0;
2891   gimple def_stmt = NULL;
2892
2893   gcc_assert (vectype);
2894   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2895
2896   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2897               || SCALAR_FLOAT_TYPE_P (scalar_type));
2898
2899   if (nested_in_vect_loop_p (loop, stmt))
2900     nested_in_vect_loop = true;
2901   else
2902     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2903
2904   /* In case of double reduction we only create a vector variable to be put
2905      in the reduction phi node. The actual statement creation is done in
2906      vect_create_epilog_for_reduction.  */
2907   if (adjustment_def && nested_in_vect_loop
2908       && TREE_CODE (init_val) == SSA_NAME
2909       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2910       && gimple_code (def_stmt) == GIMPLE_PHI
2911       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2912       && vinfo_for_stmt (def_stmt)
2913       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2914           == vect_double_reduction_def)
2915     {
2916       *adjustment_def = NULL;
2917       return vect_create_destination_var (init_val, vectype);
2918     }
2919
2920   if (TREE_CONSTANT (init_val))
2921     {
2922       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2923         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2924       else
2925         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2926     }
2927   else
2928     init_value = init_val;
2929
2930   switch (code)
2931     {
2932       case WIDEN_SUM_EXPR:
2933       case DOT_PROD_EXPR:
2934       case PLUS_EXPR:
2935       case MINUS_EXPR:
2936       case BIT_IOR_EXPR:
2937       case BIT_XOR_EXPR:
2938       case MULT_EXPR:
2939       case BIT_AND_EXPR:
2940         /* ADJUSMENT_DEF is NULL when called from
2941            vect_create_epilog_for_reduction to vectorize double reduction.  */
2942         if (adjustment_def)
2943           {
2944             if (nested_in_vect_loop)
2945               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2946                                                               NULL);
2947             else
2948               *adjustment_def = init_val;
2949           }
2950
2951         if (code == MULT_EXPR)
2952           {
2953             real_init_val = dconst1;
2954             int_init_val = 1;
2955           }
2956
2957         if (code == BIT_AND_EXPR)
2958           int_init_val = -1;
2959
2960         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2961           def_for_init = build_real (scalar_type, real_init_val);
2962         else
2963           def_for_init = build_int_cst (scalar_type, int_init_val);
2964
2965         /* Create a vector of '0' or '1' except the first element.  */
2966         for (i = nunits - 2; i >= 0; --i)
2967           t = tree_cons (NULL_TREE, def_for_init, t);
2968
2969         /* Option1: the first element is '0' or '1' as well.  */
2970         if (adjustment_def)
2971           {
2972             t = tree_cons (NULL_TREE, def_for_init, t);
2973             init_def = build_vector (vectype, t);
2974             break;
2975           }
2976
2977         /* Option2: the first element is INIT_VAL.  */
2978         t = tree_cons (NULL_TREE, init_value, t);
2979         if (TREE_CONSTANT (init_val))
2980           init_def = build_vector (vectype, t);
2981         else
2982           init_def = build_constructor_from_list (vectype, t);
2983
2984         break;
2985
2986       case MIN_EXPR:
2987       case MAX_EXPR:
2988       case COND_EXPR:
2989         if (adjustment_def)
2990           {
2991             *adjustment_def = NULL_TREE;
2992             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2993             break;
2994           }
2995
2996         for (i = nunits - 1; i >= 0; --i)
2997           t = tree_cons (NULL_TREE, init_value, t);
2998
2999         if (TREE_CONSTANT (init_val))
3000           init_def = build_vector (vectype, t);
3001         else
3002           init_def = build_constructor_from_list (vectype, t);
3003
3004         break;
3005
3006       default:
3007         gcc_unreachable ();
3008     }
3009
3010   return init_def;
3011 }
3012
3013
3014 /* Function vect_create_epilog_for_reduction
3015
3016    Create code at the loop-epilog to finalize the result of a reduction
3017    computation.
3018
3019    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3020      reduction statements.
3021    STMT is the scalar reduction stmt that is being vectorized.
3022    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3023      number of elements that we can fit in a vectype (nunits). In this case
3024      we have to generate more than one vector stmt - i.e - we need to "unroll"
3025      the vector stmt by a factor VF/nunits.  For more details see documentation
3026      in vectorizable_operation.
3027    REDUC_CODE is the tree-code for the epilog reduction.
3028    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3029      computation.
3030    REDUC_INDEX is the index of the operand in the right hand side of the
3031      statement that is defined by REDUCTION_PHI.
3032    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3033    SLP_NODE is an SLP node containing a group of reduction statements. The
3034      first one in this group is STMT.
3035
3036    This function:
3037    1. Creates the reduction def-use cycles: sets the arguments for
3038       REDUCTION_PHIS:
3039       The loop-entry argument is the vectorized initial-value of the reduction.
3040       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3041       sums.
3042    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3043       by applying the operation specified by REDUC_CODE if available, or by
3044       other means (whole-vector shifts or a scalar loop).
3045       The function also creates a new phi node at the loop exit to preserve
3046       loop-closed form, as illustrated below.
3047
3048      The flow at the entry to this function:
3049
3050         loop:
3051           vec_def = phi <null, null>            # REDUCTION_PHI
3052           VECT_DEF = vector_stmt                # vectorized form of STMT
3053           s_loop = scalar_stmt                  # (scalar) STMT
3054         loop_exit:
3055           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3056           use <s_out0>
3057           use <s_out0>
3058
3059      The above is transformed by this function into:
3060
3061         loop:
3062           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3063           VECT_DEF = vector_stmt                # vectorized form of STMT
3064           s_loop = scalar_stmt                  # (scalar) STMT
3065         loop_exit:
3066           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3067           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3068           v_out2 = reduce <v_out1>
3069           s_out3 = extract_field <v_out2, 0>
3070           s_out4 = adjust_result <s_out3>
3071           use <s_out4>
3072           use <s_out4>
3073 */
3074
3075 static void
3076 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3077                                   int ncopies, enum tree_code reduc_code,
3078                                   VEC (gimple, heap) *reduction_phis,
3079                                   int reduc_index, bool double_reduc,
3080                                   slp_tree slp_node)
3081 {
3082   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3083   stmt_vec_info prev_phi_info;
3084   tree vectype;
3085   enum machine_mode mode;
3086   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3087   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3088   basic_block exit_bb;
3089   tree scalar_dest;
3090   tree scalar_type;
3091   gimple new_phi = NULL, phi;
3092   gimple_stmt_iterator exit_gsi;
3093   tree vec_dest;
3094   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3095   gimple epilog_stmt = NULL;
3096   enum tree_code code = gimple_assign_rhs_code (stmt);
3097   gimple exit_phi;
3098   tree bitsize, bitpos;
3099   tree adjustment_def = NULL;
3100   tree vec_initial_def = NULL;
3101   tree reduction_op, expr, def;
3102   tree orig_name, scalar_result;
3103   imm_use_iterator imm_iter;
3104   use_operand_p use_p;
3105   bool extract_scalar_result = false;
3106   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3107   bool nested_in_vect_loop = false;
3108   VEC (gimple, heap) *new_phis = NULL;
3109   enum vect_def_type dt = vect_unknown_def_type;
3110   int j, i;
3111   VEC (tree, heap) *scalar_results = NULL;
3112   unsigned int group_size = 1, k, ratio;
3113   VEC (tree, heap) *vec_initial_defs = NULL;
3114   VEC (gimple, heap) *phis;
3115
3116   if (slp_node)
3117     group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3118
3119   if (nested_in_vect_loop_p (loop, stmt))
3120     {
3121       outer_loop = loop;
3122       loop = loop->inner;
3123       nested_in_vect_loop = true;
3124       gcc_assert (!slp_node);
3125     }
3126
3127   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3128     {
3129     case GIMPLE_SINGLE_RHS:
3130       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3131                                        == ternary_op);
3132       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3133       break;
3134     case GIMPLE_UNARY_RHS:
3135       reduction_op = gimple_assign_rhs1 (stmt);
3136       break;
3137     case GIMPLE_BINARY_RHS:
3138       reduction_op = reduc_index ?
3139                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3140       break;
3141     default:
3142       gcc_unreachable ();
3143     }
3144
3145   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3146   gcc_assert (vectype);
3147   mode = TYPE_MODE (vectype);
3148
3149   /* 1. Create the reduction def-use cycle:
3150      Set the arguments of REDUCTION_PHIS, i.e., transform
3151
3152         loop:
3153           vec_def = phi <null, null>            # REDUCTION_PHI
3154           VECT_DEF = vector_stmt                # vectorized form of STMT
3155           ...
3156
3157      into:
3158
3159         loop:
3160           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3161           VECT_DEF = vector_stmt                # vectorized form of STMT
3162           ...
3163
3164      (in case of SLP, do it for all the phis). */
3165
3166   /* Get the loop-entry arguments.  */
3167   if (slp_node)
3168     vect_get_slp_defs (slp_node, &vec_initial_defs, NULL, reduc_index);
3169   else
3170     {
3171       vec_initial_defs = VEC_alloc (tree, heap, 1);
3172      /* For the case of reduction, vect_get_vec_def_for_operand returns
3173         the scalar def before the loop, that defines the initial value
3174         of the reduction variable.  */
3175       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3176                                                       &adjustment_def);
3177       VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3178     }
3179
3180   /* Set phi nodes arguments.  */
3181   for (i = 0; VEC_iterate (gimple, reduction_phis, i, phi); i++)
3182     {
3183       tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3184       tree def = VEC_index (tree, vect_defs, i);
3185       for (j = 0; j < ncopies; j++)
3186         {
3187           /* Set the loop-entry arg of the reduction-phi.  */
3188           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3189                        UNKNOWN_LOCATION);
3190
3191           /* Set the loop-latch arg for the reduction-phi.  */
3192           if (j > 0)
3193             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3194
3195           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3196
3197           if (vect_print_dump_info (REPORT_DETAILS))
3198             {
3199               fprintf (vect_dump, "transform reduction: created def-use"
3200                                   " cycle: ");
3201               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3202               fprintf (vect_dump, "\n");
3203               print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3204                                  TDF_SLIM);
3205             }
3206
3207           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3208         }
3209     }
3210
3211   VEC_free (tree, heap, vec_initial_defs);
3212
3213   /* 2. Create epilog code.
3214         The reduction epilog code operates across the elements of the vector
3215         of partial results computed by the vectorized loop.
3216         The reduction epilog code consists of:
3217
3218         step 1: compute the scalar result in a vector (v_out2)
3219         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3220         step 3: adjust the scalar result (s_out3) if needed.
3221
3222         Step 1 can be accomplished using one the following three schemes:
3223           (scheme 1) using reduc_code, if available.
3224           (scheme 2) using whole-vector shifts, if available.
3225           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3226                      combined.
3227
3228           The overall epilog code looks like this:
3229
3230           s_out0 = phi <s_loop>         # original EXIT_PHI
3231           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3232           v_out2 = reduce <v_out1>              # step 1
3233           s_out3 = extract_field <v_out2, 0>    # step 2
3234           s_out4 = adjust_result <s_out3>       # step 3
3235
3236           (step 3 is optional, and steps 1 and 2 may be combined).
3237           Lastly, the uses of s_out0 are replaced by s_out4.  */
3238
3239
3240   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3241          v_out1 = phi <VECT_DEF>
3242          Store them in NEW_PHIS.  */
3243
3244   exit_bb = single_exit (loop)->dest;
3245   prev_phi_info = NULL;
3246   new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3247   for (i = 0; VEC_iterate (tree, vect_defs, i, def); i++)
3248     {
3249       for (j = 0; j < ncopies; j++)
3250         {
3251           phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3252           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3253           if (j == 0)
3254             VEC_quick_push (gimple, new_phis, phi);
3255           else
3256             {
3257               def = vect_get_vec_def_for_stmt_copy (dt, def);
3258               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3259             }
3260
3261           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3262           prev_phi_info = vinfo_for_stmt (phi);
3263         }
3264     }
3265
3266   exit_gsi = gsi_after_labels (exit_bb);
3267
3268   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3269          (i.e. when reduc_code is not available) and in the final adjustment
3270          code (if needed).  Also get the original scalar reduction variable as
3271          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3272          represents a reduction pattern), the tree-code and scalar-def are
3273          taken from the original stmt that the pattern-stmt (STMT) replaces.
3274          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3275          are taken from STMT.  */
3276
3277   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3278   if (!orig_stmt)
3279     {
3280       /* Regular reduction  */
3281       orig_stmt = stmt;
3282     }
3283   else
3284     {
3285       /* Reduction pattern  */
3286       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3287       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3288       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3289     }
3290
3291   code = gimple_assign_rhs_code (orig_stmt);
3292   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3293      partial results are added and not subtracted.  */
3294   if (code == MINUS_EXPR)
3295     code = PLUS_EXPR;
3296
3297   scalar_dest = gimple_assign_lhs (orig_stmt);
3298   scalar_type = TREE_TYPE (scalar_dest);
3299   scalar_results = VEC_alloc (tree, heap, group_size);
3300   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3301   bitsize = TYPE_SIZE (scalar_type);
3302
3303   /* In case this is a reduction in an inner-loop while vectorizing an outer
3304      loop - we don't need to extract a single scalar result at the end of the
3305      inner-loop (unless it is double reduction, i.e., the use of reduction is
3306      outside the outer-loop). The final vector of partial results will be used
3307      in the vectorized outer-loop, or reduced to a scalar result at the end of
3308      the outer-loop.  */
3309   if (nested_in_vect_loop && !double_reduc)
3310     goto vect_finalize_reduction;
3311
3312   /* 2.3 Create the reduction code, using one of the three schemes described
3313          above. In SLP we simply need to extract all the elements from the
3314          vector (without reducing them), so we use scalar shifts.  */
3315   if (reduc_code != ERROR_MARK && !slp_node)
3316     {
3317       tree tmp;
3318
3319       /*** Case 1:  Create:
3320            v_out2 = reduc_expr <v_out1>  */
3321
3322       if (vect_print_dump_info (REPORT_DETAILS))
3323         fprintf (vect_dump, "Reduce using direct vector reduction.");
3324
3325       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3326       new_phi = VEC_index (gimple, new_phis, 0);
3327       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3328       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3329       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3330       gimple_assign_set_lhs (epilog_stmt, new_temp);
3331       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3332
3333       extract_scalar_result = true;
3334     }
3335   else
3336     {
3337       enum tree_code shift_code = ERROR_MARK;
3338       bool have_whole_vector_shift = true;
3339       int bit_offset;
3340       int element_bitsize = tree_low_cst (bitsize, 1);
3341       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3342       tree vec_temp;
3343
3344       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3345         shift_code = VEC_RSHIFT_EXPR;
3346       else
3347         have_whole_vector_shift = false;
3348
3349       /* Regardless of whether we have a whole vector shift, if we're
3350          emulating the operation via tree-vect-generic, we don't want
3351          to use it.  Only the first round of the reduction is likely
3352          to still be profitable via emulation.  */
3353       /* ??? It might be better to emit a reduction tree code here, so that
3354          tree-vect-generic can expand the first round via bit tricks.  */
3355       if (!VECTOR_MODE_P (mode))
3356         have_whole_vector_shift = false;
3357       else
3358         {
3359           optab optab = optab_for_tree_code (code, vectype, optab_default);
3360           if (optab_handler (optab, mode) == CODE_FOR_nothing)
3361             have_whole_vector_shift = false;
3362         }
3363
3364       if (have_whole_vector_shift && !slp_node)
3365         {
3366           /*** Case 2: Create:
3367              for (offset = VS/2; offset >= element_size; offset/=2)
3368                 {
3369                   Create:  va' = vec_shift <va, offset>
3370                   Create:  va = vop <va, va'>
3371                 }  */
3372
3373           if (vect_print_dump_info (REPORT_DETAILS))
3374             fprintf (vect_dump, "Reduce using vector shifts");
3375
3376           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3377           new_phi = VEC_index (gimple, new_phis, 0);
3378           new_temp = PHI_RESULT (new_phi);
3379           for (bit_offset = vec_size_in_bits/2;
3380                bit_offset >= element_bitsize;
3381                bit_offset /= 2)
3382             {
3383               tree bitpos = size_int (bit_offset);
3384
3385               epilog_stmt = gimple_build_assign_with_ops (shift_code,
3386                                                vec_dest, new_temp, bitpos);
3387               new_name = make_ssa_name (vec_dest, epilog_stmt);
3388               gimple_assign_set_lhs (epilog_stmt, new_name);
3389               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3390
3391               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3392                                                           new_name, new_temp);
3393               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3394               gimple_assign_set_lhs (epilog_stmt, new_temp);
3395               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3396             }
3397
3398           extract_scalar_result = true;
3399         }
3400       else
3401         {
3402           tree rhs;
3403
3404           /*** Case 3: Create:
3405              s = extract_field <v_out2, 0>
3406              for (offset = element_size;
3407                   offset < vector_size;
3408                   offset += element_size;)
3409                {
3410                  Create:  s' = extract_field <v_out2, offset>
3411                  Create:  s = op <s, s'>  // For non SLP cases
3412                }  */
3413
3414           if (vect_print_dump_info (REPORT_DETAILS))
3415             fprintf (vect_dump, "Reduce using scalar code. ");
3416
3417           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3418           for (i = 0; VEC_iterate (gimple, new_phis, i, new_phi); i++)
3419             {
3420               vec_temp = PHI_RESULT (new_phi);
3421               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3422                             bitsize_zero_node);
3423               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3424               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3425               gimple_assign_set_lhs (epilog_stmt, new_temp);
3426               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3427
3428               /* In SLP we don't need to apply reduction operation, so we just
3429                  collect s' values in SCALAR_RESULTS.  */
3430               if (slp_node)
3431                 VEC_safe_push (tree, heap, scalar_results, new_temp);
3432
3433               for (bit_offset = element_bitsize;
3434                    bit_offset < vec_size_in_bits;
3435                    bit_offset += element_bitsize)
3436                 {
3437                   tree bitpos = bitsize_int (bit_offset);
3438                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3439                                      bitsize, bitpos);
3440
3441                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3442                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3443                   gimple_assign_set_lhs (epilog_stmt, new_name);
3444                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3445
3446                   if (slp_node)
3447                     {
3448                       /* In SLP we don't need to apply reduction operation, so
3449                          we just collect s' values in SCALAR_RESULTS.  */
3450                       new_temp = new_name;
3451                       VEC_safe_push (tree, heap, scalar_results, new_name);
3452                     }
3453                   else
3454                     {
3455                       epilog_stmt = gimple_build_assign_with_ops (code,
3456                                           new_scalar_dest, new_name, new_temp);
3457                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3458                       gimple_assign_set_lhs (epilog_stmt, new_temp);
3459                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3460                     }
3461                 }
3462             }
3463
3464           /* The only case where we need to reduce scalar results in SLP, is
3465              unrolling. If the size of SCALAR_RESULTS is greater than
3466              GROUP_SIZE, we reduce them combining elements modulo
3467              GROUP_SIZE.  */
3468           if (slp_node)
3469             {
3470               tree res, first_res, new_res;
3471               gimple new_stmt;
3472
3473               /* Reduce multiple scalar results in case of SLP unrolling.  */
3474               for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3475                    j++)
3476                 {
3477                   first_res = VEC_index (tree, scalar_results, j % group_size);
3478                   new_stmt = gimple_build_assign_with_ops (code,
3479                                               new_scalar_dest, first_res, res);
3480                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
3481                   gimple_assign_set_lhs (new_stmt, new_res);
3482                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3483                   VEC_replace (tree, scalar_results, j % group_size, new_res);
3484                 }
3485             }
3486           else
3487             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
3488             VEC_safe_push (tree, heap, scalar_results, new_temp);
3489
3490           extract_scalar_result = false;
3491         }
3492     }
3493
3494   /* 2.4  Extract the final scalar result.  Create:
3495           s_out3 = extract_field <v_out2, bitpos>  */
3496
3497   if (extract_scalar_result)
3498     {
3499       tree rhs;
3500
3501       if (vect_print_dump_info (REPORT_DETAILS))
3502         fprintf (vect_dump, "extract scalar result");
3503
3504       if (BYTES_BIG_ENDIAN)
3505         bitpos = size_binop (MULT_EXPR,
3506                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3507                              TYPE_SIZE (scalar_type));
3508       else
3509         bitpos = bitsize_zero_node;
3510
3511       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3512       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3513       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3514       gimple_assign_set_lhs (epilog_stmt, new_temp);
3515       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3516       VEC_safe_push (tree, heap, scalar_results, new_temp);
3517     }
3518
3519 vect_finalize_reduction:
3520
3521   /* 2.5 Adjust the final result by the initial value of the reduction
3522          variable. (When such adjustment is not needed, then
3523          'adjustment_def' is zero).  For example, if code is PLUS we create:
3524          new_temp = loop_exit_def + adjustment_def  */
3525
3526   if (adjustment_def)
3527     {
3528       gcc_assert (!slp_node);
3529       if (nested_in_vect_loop)
3530         {
3531           new_phi = VEC_index (gimple, new_phis, 0);
3532           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3533           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3534           new_dest = vect_create_destination_var (scalar_dest, vectype);
3535         }
3536       else
3537         {
3538           new_temp = VEC_index (tree, scalar_results, 0);
3539           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3540           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3541           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3542         }
3543
3544       epilog_stmt = gimple_build_assign (new_dest, expr);
3545       new_temp = make_ssa_name (new_dest, epilog_stmt);
3546       gimple_assign_set_lhs (epilog_stmt, new_temp);
3547       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3548       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3549       if (nested_in_vect_loop)
3550         {
3551           set_vinfo_for_stmt (epilog_stmt,
3552                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3553                                                  NULL));
3554           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3555                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3556
3557           if (!double_reduc)
3558             VEC_quick_push (tree, scalar_results, new_temp);
3559           else
3560             VEC_replace (tree, scalar_results, 0, new_temp);
3561         }
3562       else
3563         VEC_replace (tree, scalar_results, 0, new_temp);
3564
3565       VEC_replace (gimple, new_phis, 0, epilog_stmt);
3566     }
3567
3568   /* 2.6  Handle the loop-exit phis. Replace the uses of scalar loop-exit
3569           phis with new adjusted scalar results, i.e., replace use <s_out0>
3570           with use <s_out4>.
3571
3572      Transform:
3573         loop_exit:
3574           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3575           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3576           v_out2 = reduce <v_out1>
3577           s_out3 = extract_field <v_out2, 0>
3578           s_out4 = adjust_result <s_out3>
3579           use <s_out0>
3580           use <s_out0>
3581
3582      into:
3583
3584         loop_exit:
3585           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3586           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3587           v_out2 = reduce <v_out1>
3588           s_out3 = extract_field <v_out2, 0>
3589           s_out4 = adjust_result <s_out3>
3590           use <s_out4>
3591           use <s_out4> */
3592
3593   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
3594      case that GROUP_SIZE is greater than vectorization factor). Therefore, we
3595      need to match SCALAR_RESULTS with corresponding statements. The first
3596      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
3597      the first vector stmt, etc.
3598      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
3599   if (group_size > VEC_length (gimple, new_phis))
3600     {
3601       ratio = group_size / VEC_length (gimple, new_phis);
3602       gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
3603     }
3604   else
3605     ratio = 1;
3606
3607   for (k = 0; k < group_size; k++)
3608     {
3609       if (k % ratio == 0)
3610         {
3611           epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
3612           reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
3613         }
3614
3615       if (slp_node)
3616         {
3617           gimple current_stmt = VEC_index (gimple,
3618                                        SLP_TREE_SCALAR_STMTS (slp_node), k);
3619
3620           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
3621           /* SLP statements can't participate in patterns.  */
3622           gcc_assert (!orig_stmt);
3623           scalar_dest = gimple_assign_lhs (current_stmt);
3624         }
3625
3626       phis = VEC_alloc (gimple, heap, 3);
3627       /* Find the loop-closed-use at the loop exit of the original scalar
3628          result. (The reduction result is expected to have two immediate uses -
3629          one at the latch block, and one at the loop exit).  */
3630       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3631         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3632           VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
3633
3634       /* We expect to have found an exit_phi because of loop-closed-ssa
3635          form.  */
3636       gcc_assert (!VEC_empty (gimple, phis));
3637
3638       for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3639         {
3640           if (outer_loop)
3641             {
3642               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
3643               gimple vect_phi;
3644
3645               /* FORNOW. Currently not supporting the case that an inner-loop
3646                  reduction is not used in the outer-loop (but only outside the
3647                  outer-loop), unless it is double reduction.  */
3648               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
3649                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
3650                           || double_reduc);
3651
3652               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
3653               if (!double_reduc
3654                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
3655                       != vect_double_reduction_def)
3656                 continue;
3657
3658               /* Handle double reduction:
3659
3660                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3661                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
3662                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
3663                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3664
3665                  At that point the regular reduction (stmt2 and stmt3) is
3666                  already vectorized, as well as the exit phi node, stmt4.
3667                  Here we vectorize the phi node of double reduction, stmt1, and
3668                  update all relevant statements.  */
3669
3670               /* Go through all the uses of s2 to find double reduction phi
3671                  node, i.e., stmt1 above.  */
3672               orig_name = PHI_RESULT (exit_phi);
3673               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3674                 {
3675                   stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3676                   stmt_vec_info new_phi_vinfo;
3677                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3678                   basic_block bb = gimple_bb (use_stmt);
3679                   gimple use;
3680
3681                   /* Check that USE_STMT is really double reduction phi
3682                      node.  */
3683                   if (gimple_code (use_stmt) != GIMPLE_PHI
3684                       || gimple_phi_num_args (use_stmt) != 2
3685                       || !use_stmt_vinfo
3686                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3687                           != vect_double_reduction_def
3688                       || bb->loop_father != outer_loop)
3689                     continue;
3690
3691                   /* Create vector phi node for double reduction:
3692                      vs1 = phi <vs0, vs2>
3693                      vs1 was created previously in this function by a call to
3694                        vect_get_vec_def_for_operand and is stored in
3695                        vec_initial_def;
3696                      vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3697                      vs0 is created here.  */
3698
3699                   /* Create vector phi node.  */
3700                   vect_phi = create_phi_node (vec_initial_def, bb);
3701                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
3702                                     loop_vec_info_for_loop (outer_loop), NULL);
3703                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3704
3705                   /* Create vs0 - initial def of the double reduction phi.  */
3706                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3707                                              loop_preheader_edge (outer_loop));
3708                   init_def = get_initial_def_for_reduction (stmt,
3709                                                           preheader_arg, NULL);
3710                   vect_phi_init = vect_init_vector (use_stmt, init_def,
3711                                                     vectype, NULL);
3712
3713                   /* Update phi node arguments with vs0 and vs2.  */
3714                   add_phi_arg (vect_phi, vect_phi_init,
3715                                loop_preheader_edge (outer_loop),
3716                                UNKNOWN_LOCATION);
3717                   add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3718                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3719                   if (vect_print_dump_info (REPORT_DETAILS))
3720                     {
3721                       fprintf (vect_dump, "created double reduction phi "
3722                                           "node: ");
3723                       print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3724                     }
3725
3726                   vect_phi_res = PHI_RESULT (vect_phi);
3727
3728                   /* Replace the use, i.e., set the correct vs1 in the regular
3729                      reduction phi node. FORNOW, NCOPIES is always 1, so the
3730                      loop is redundant.  */
3731                   use = reduction_phi;
3732                   for (j = 0; j < ncopies; j++)
3733                     {
3734                       edge pr_edge = loop_preheader_edge (loop);
3735                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3736                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3737                     }
3738                 }
3739             }
3740
3741           /* Replace the uses:  */
3742           orig_name = PHI_RESULT (exit_phi);
3743           scalar_result = VEC_index (tree, scalar_results, k);
3744           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3745             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3746               SET_USE (use_p, scalar_result);
3747         }
3748
3749       VEC_free (gimple, heap, phis);
3750     }
3751
3752   VEC_free (tree, heap, scalar_results);
3753   VEC_free (gimple, heap, new_phis);
3754 }
3755
3756
3757 /* Function vectorizable_reduction.
3758
3759    Check if STMT performs a reduction operation that can be vectorized.
3760    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3761    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3762    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3763
3764    This function also handles reduction idioms (patterns) that have been
3765    recognized in advance during vect_pattern_recog. In this case, STMT may be
3766    of this form:
3767      X = pattern_expr (arg0, arg1, ..., X)
3768    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3769    sequence that had been detected and replaced by the pattern-stmt (STMT).
3770
3771    In some cases of reduction patterns, the type of the reduction variable X is
3772    different than the type of the other arguments of STMT.
3773    In such cases, the vectype that is used when transforming STMT into a vector
3774    stmt is different than the vectype that is used to determine the
3775    vectorization factor, because it consists of a different number of elements
3776    than the actual number of elements that are being operated upon in parallel.
3777
3778    For example, consider an accumulation of shorts into an int accumulator.
3779    On some targets it's possible to vectorize this pattern operating on 8
3780    shorts at a time (hence, the vectype for purposes of determining the
3781    vectorization factor should be V8HI); on the other hand, the vectype that
3782    is used to create the vector form is actually V4SI (the type of the result).
3783
3784    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3785    indicates what is the actual level of parallelism (V8HI in the example), so
3786    that the right vectorization factor would be derived. This vectype
3787    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3788    be used to create the vectorized stmt. The right vectype for the vectorized
3789    stmt is obtained from the type of the result X:
3790         get_vectype_for_scalar_type (TREE_TYPE (X))
3791
3792    This means that, contrary to "regular" reductions (or "regular" stmts in
3793    general), the following equation:
3794       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3795    does *NOT* necessarily hold for reduction patterns.  */
3796
3797 bool
3798 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3799                         gimple *vec_stmt, slp_tree slp_node)
3800 {
3801   tree vec_dest;
3802   tree scalar_dest;
3803   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3804   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3805   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
3806   tree vectype_in = NULL_TREE;
3807   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3808   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3809   enum tree_code code, orig_code, epilog_reduc_code;
3810   enum machine_mode vec_mode;
3811   int op_type;
3812   optab optab, reduc_optab;
3813   tree new_temp = NULL_TREE;
3814   tree def;
3815   gimple def_stmt;
3816   enum vect_def_type dt;
3817   gimple new_phi = NULL;
3818   tree scalar_type;
3819   bool is_simple_use;
3820   gimple orig_stmt;
3821   stmt_vec_info orig_stmt_info;
3822   tree expr = NULL_TREE;
3823   int i;
3824   int ncopies;
3825   int epilog_copies;
3826   stmt_vec_info prev_stmt_info, prev_phi_info;
3827   bool single_defuse_cycle = false;
3828   tree reduc_def = NULL_TREE;
3829   gimple new_stmt = NULL;
3830   int j;
3831   tree ops[3];
3832   bool nested_cycle = false, found_nested_cycle_def = false;
3833   gimple reduc_def_stmt = NULL;
3834   /* The default is that the reduction variable is the last in statement.  */
3835   int reduc_index = 2;
3836   bool double_reduc = false, dummy;
3837   basic_block def_bb;
3838   struct loop * def_stmt_loop, *outer_loop = NULL;
3839   tree def_arg;
3840   gimple def_arg_stmt;
3841   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
3842   VEC (gimple, heap) *phis = NULL;
3843   int vec_num;
3844   tree def0, def1;
3845
3846   if (nested_in_vect_loop_p (loop, stmt))
3847     {
3848       outer_loop = loop;
3849       loop = loop->inner;
3850       nested_cycle = true;
3851     }
3852
3853   /* 1. Is vectorizable reduction?  */
3854   /* Not supportable if the reduction variable is used in the loop.  */
3855   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3856     return false;
3857
3858   /* Reductions that are not used even in an enclosing outer-loop,
3859      are expected to be "live" (used out of the loop).  */
3860   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3861       && !STMT_VINFO_LIVE_P (stmt_info))
3862     return false;
3863
3864   /* Make sure it was already recognized as a reduction computation.  */
3865   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3866       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3867     return false;
3868
3869   /* 2. Has this been recognized as a reduction pattern?
3870
3871      Check if STMT represents a pattern that has been recognized
3872      in earlier analysis stages.  For stmts that represent a pattern,
3873      the STMT_VINFO_RELATED_STMT field records the last stmt in
3874      the original sequence that constitutes the pattern.  */
3875
3876   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3877   if (orig_stmt)
3878     {
3879       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3880       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3881       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3882       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3883     }
3884
3885   /* 3. Check the operands of the operation. The first operands are defined
3886         inside the loop body. The last operand is the reduction variable,
3887         which is defined by the loop-header-phi.  */
3888
3889   gcc_assert (is_gimple_assign (stmt));
3890
3891   /* Flatten RHS */
3892   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3893     {
3894     case GIMPLE_SINGLE_RHS:
3895       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3896       if (op_type == ternary_op)
3897         {
3898           tree rhs = gimple_assign_rhs1 (stmt);
3899           ops[0] = TREE_OPERAND (rhs, 0);
3900           ops[1] = TREE_OPERAND (rhs, 1);
3901           ops[2] = TREE_OPERAND (rhs, 2);
3902           code = TREE_CODE (rhs);
3903         }
3904       else
3905         return false;
3906       break;
3907
3908     case GIMPLE_BINARY_RHS:
3909       code = gimple_assign_rhs_code (stmt);
3910       op_type = TREE_CODE_LENGTH (code);
3911       gcc_assert (op_type == binary_op);
3912       ops[0] = gimple_assign_rhs1 (stmt);
3913       ops[1] = gimple_assign_rhs2 (stmt);
3914       break;
3915
3916     case GIMPLE_UNARY_RHS:
3917       return false;
3918
3919     default:
3920       gcc_unreachable ();
3921     }
3922
3923   scalar_dest = gimple_assign_lhs (stmt);
3924   scalar_type = TREE_TYPE (scalar_dest);
3925   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3926       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3927     return false;
3928
3929   /* All uses but the last are expected to be defined in the loop.
3930      The last use is the reduction variable. In case of nested cycle this
3931      assumption is not true: we use reduc_index to record the index of the
3932      reduction variable.  */
3933   for (i = 0; i < op_type-1; i++)
3934     {
3935       tree tem;
3936
3937       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3938       if (i == 0 && code == COND_EXPR)
3939         continue;
3940
3941       is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
3942                                             &def_stmt, &def, &dt, &tem);
3943       if (!vectype_in)
3944         vectype_in = tem;
3945       gcc_assert (is_simple_use);
3946       if (dt != vect_internal_def
3947           && dt != vect_external_def
3948           && dt != vect_constant_def
3949           && dt != vect_induction_def
3950           && !(dt == vect_nested_cycle && nested_cycle))
3951         return false;
3952
3953       if (dt == vect_nested_cycle)
3954         {
3955           found_nested_cycle_def = true;
3956           reduc_def_stmt = def_stmt;
3957           reduc_index = i;
3958         }
3959     }
3960
3961   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3962                                       &def, &dt);
3963   gcc_assert (is_simple_use);
3964   gcc_assert (dt == vect_reduction_def
3965               || dt == vect_nested_cycle
3966               || ((dt == vect_internal_def || dt == vect_external_def
3967                    || dt == vect_constant_def || dt == vect_induction_def)
3968                    && nested_cycle && found_nested_cycle_def));
3969   if (!found_nested_cycle_def)
3970     reduc_def_stmt = def_stmt;
3971
3972   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3973   if (orig_stmt)
3974     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3975                                                        reduc_def_stmt,
3976                                                        !nested_cycle,
3977                                                        &dummy));
3978   else
3979     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3980                                                   !nested_cycle, &dummy));
3981
3982   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3983     return false;
3984
3985   if (slp_node)
3986     ncopies = 1;
3987   else
3988     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3989                / TYPE_VECTOR_SUBPARTS (vectype_in));
3990
3991   gcc_assert (ncopies >= 1);
3992
3993   vec_mode = TYPE_MODE (vectype_in);
3994
3995   if (code == COND_EXPR)
3996     {
3997       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3998         {
3999           if (vect_print_dump_info (REPORT_DETAILS))
4000             fprintf (vect_dump, "unsupported condition in reduction");
4001
4002             return false;
4003         }
4004     }
4005   else
4006     {
4007       /* 4. Supportable by target?  */
4008
4009       /* 4.1. check support for the operation in the loop  */
4010       optab = optab_for_tree_code (code, vectype_in, optab_default);
4011       if (!optab)
4012         {
4013           if (vect_print_dump_info (REPORT_DETAILS))
4014             fprintf (vect_dump, "no optab.");
4015
4016           return false;
4017         }
4018
4019       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4020         {
4021           if (vect_print_dump_info (REPORT_DETAILS))
4022             fprintf (vect_dump, "op not supported by target.");
4023
4024           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4025               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4026                   < vect_min_worthwhile_factor (code))
4027             return false;
4028
4029           if (vect_print_dump_info (REPORT_DETAILS))
4030             fprintf (vect_dump, "proceeding using word mode.");
4031         }
4032
4033       /* Worthwhile without SIMD support?  */
4034       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4035           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4036              < vect_min_worthwhile_factor (code))
4037         {
4038           if (vect_print_dump_info (REPORT_DETAILS))
4039             fprintf (vect_dump, "not worthwhile without SIMD support.");
4040
4041           return false;
4042         }
4043     }
4044
4045   /* 4.2. Check support for the epilog operation.
4046
4047           If STMT represents a reduction pattern, then the type of the
4048           reduction variable may be different than the type of the rest
4049           of the arguments.  For example, consider the case of accumulation
4050           of shorts into an int accumulator; The original code:
4051                         S1: int_a = (int) short_a;
4052           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4053
4054           was replaced with:
4055                         STMT: int_acc = widen_sum <short_a, int_acc>
4056
4057           This means that:
4058           1. The tree-code that is used to create the vector operation in the
4059              epilog code (that reduces the partial results) is not the
4060              tree-code of STMT, but is rather the tree-code of the original
4061              stmt from the pattern that STMT is replacing. I.e, in the example
4062              above we want to use 'widen_sum' in the loop, but 'plus' in the
4063              epilog.
4064           2. The type (mode) we use to check available target support
4065              for the vector operation to be created in the *epilog*, is
4066              determined by the type of the reduction variable (in the example
4067              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4068              However the type (mode) we use to check available target support
4069              for the vector operation to be created *inside the loop*, is
4070              determined by the type of the other arguments to STMT (in the
4071              example we'd check this: optab_handler (widen_sum_optab,
4072              vect_short_mode)).
4073
4074           This is contrary to "regular" reductions, in which the types of all
4075           the arguments are the same as the type of the reduction variable.
4076           For "regular" reductions we can therefore use the same vector type
4077           (and also the same tree-code) when generating the epilog code and
4078           when generating the code inside the loop.  */
4079
4080   if (orig_stmt)
4081     {
4082       /* This is a reduction pattern: get the vectype from the type of the
4083          reduction variable, and get the tree-code from orig_stmt.  */
4084       orig_code = gimple_assign_rhs_code (orig_stmt);
4085       gcc_assert (vectype_out);
4086       vec_mode = TYPE_MODE (vectype_out);
4087     }
4088   else
4089     {
4090       /* Regular reduction: use the same vectype and tree-code as used for
4091          the vector code inside the loop can be used for the epilog code. */
4092       orig_code = code;
4093     }
4094
4095   if (nested_cycle)
4096     {
4097       def_bb = gimple_bb (reduc_def_stmt);
4098       def_stmt_loop = def_bb->loop_father;
4099       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4100                                        loop_preheader_edge (def_stmt_loop));
4101       if (TREE_CODE (def_arg) == SSA_NAME
4102           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4103           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4104           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4105           && vinfo_for_stmt (def_arg_stmt)
4106           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4107               == vect_double_reduction_def)
4108         double_reduc = true;
4109     }
4110
4111   epilog_reduc_code = ERROR_MARK;
4112   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4113     {
4114       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4115                                          optab_default);
4116       if (!reduc_optab)
4117         {
4118           if (vect_print_dump_info (REPORT_DETAILS))
4119             fprintf (vect_dump, "no optab for reduction.");
4120
4121           epilog_reduc_code = ERROR_MARK;
4122         }
4123
4124       if (reduc_optab
4125           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4126         {
4127           if (vect_print_dump_info (REPORT_DETAILS))
4128             fprintf (vect_dump, "reduc op not supported by target.");
4129
4130           epilog_reduc_code = ERROR_MARK;
4131         }
4132     }
4133   else
4134     {
4135       if (!nested_cycle || double_reduc)
4136         {
4137           if (vect_print_dump_info (REPORT_DETAILS))
4138             fprintf (vect_dump, "no reduc code for scalar code.");
4139
4140           return false;
4141         }
4142     }
4143
4144   if (double_reduc && ncopies > 1)
4145     {
4146       if (vect_print_dump_info (REPORT_DETAILS))
4147         fprintf (vect_dump, "multiple types in double reduction");
4148
4149       return false;
4150     }
4151
4152   if (!vec_stmt) /* transformation not required.  */
4153     {
4154       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4155       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4156         return false;
4157       return true;
4158     }
4159
4160   /** Transform.  **/
4161
4162   if (vect_print_dump_info (REPORT_DETAILS))
4163     fprintf (vect_dump, "transform reduction.");
4164
4165   /* FORNOW: Multiple types are not supported for condition.  */
4166   if (code == COND_EXPR)
4167     gcc_assert (ncopies == 1);
4168
4169   /* Create the destination vector  */
4170   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4171
4172   /* In case the vectorization factor (VF) is bigger than the number
4173      of elements that we can fit in a vectype (nunits), we have to generate
4174      more than one vector stmt - i.e - we need to "unroll" the
4175      vector stmt by a factor VF/nunits.  For more details see documentation
4176      in vectorizable_operation.  */
4177
4178   /* If the reduction is used in an outer loop we need to generate
4179      VF intermediate results, like so (e.g. for ncopies=2):
4180         r0 = phi (init, r0)
4181         r1 = phi (init, r1)
4182         r0 = x0 + r0;
4183         r1 = x1 + r1;
4184     (i.e. we generate VF results in 2 registers).
4185     In this case we have a separate def-use cycle for each copy, and therefore
4186     for each copy we get the vector def for the reduction variable from the
4187     respective phi node created for this copy.
4188
4189     Otherwise (the reduction is unused in the loop nest), we can combine
4190     together intermediate results, like so (e.g. for ncopies=2):
4191         r = phi (init, r)
4192         r = x0 + r;
4193         r = x1 + r;
4194    (i.e. we generate VF/2 results in a single register).
4195    In this case for each copy we get the vector def for the reduction variable
4196    from the vectorized reduction operation generated in the previous iteration.
4197   */
4198
4199   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4200     {
4201       single_defuse_cycle = true;
4202       epilog_copies = 1;
4203     }
4204   else
4205     epilog_copies = ncopies;
4206
4207   prev_stmt_info = NULL;
4208   prev_phi_info = NULL;
4209   if (slp_node)
4210     {
4211       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4212       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4213                   == TYPE_VECTOR_SUBPARTS (vectype_in));
4214     }
4215   else
4216     {
4217       vec_num = 1;
4218       vec_oprnds0 = VEC_alloc (tree, heap, 1);
4219       if (op_type == ternary_op)
4220         vec_oprnds1 = VEC_alloc (tree, heap, 1);
4221     }
4222
4223   phis = VEC_alloc (gimple, heap, vec_num);
4224   vect_defs = VEC_alloc (tree, heap, vec_num);
4225   if (!slp_node)
4226     VEC_quick_push (tree, vect_defs, NULL_TREE);
4227
4228   for (j = 0; j < ncopies; j++)
4229     {
4230       if (j == 0 || !single_defuse_cycle)
4231         {
4232           for (i = 0; i < vec_num; i++)
4233             {
4234               /* Create the reduction-phi that defines the reduction
4235                  operand.  */
4236               new_phi = create_phi_node (vec_dest, loop->header);
4237               set_vinfo_for_stmt (new_phi,
4238                                   new_stmt_vec_info (new_phi, loop_vinfo,
4239                                                      NULL));
4240                if (j == 0 || slp_node)
4241                  VEC_quick_push (gimple, phis, new_phi);
4242             }
4243         }
4244
4245       if (code == COND_EXPR)
4246         {
4247           gcc_assert (!slp_node);
4248           vectorizable_condition (stmt, gsi, vec_stmt,
4249                                   PHI_RESULT (VEC_index (gimple, phis, 0)),
4250                                   reduc_index);
4251           /* Multiple types are not supported for condition.  */
4252           break;
4253         }
4254
4255       /* Handle uses.  */
4256       if (j == 0)
4257         {
4258           if (slp_node)
4259             vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
4260           else
4261             {
4262               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4263                                                             stmt, NULL);
4264               VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4265               if (op_type == ternary_op)
4266                {
4267                  if (reduc_index == 0)
4268                    loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
4269                                                                  NULL);
4270                  else
4271                    loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
4272                                                                  NULL);
4273
4274                  VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4275                }
4276             }
4277         }
4278       else
4279         {
4280           if (!slp_node)
4281             {
4282               enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
4283               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
4284               VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4285               if (op_type == ternary_op)
4286                 {
4287                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4288                                                                 loop_vec_def1);
4289                   VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4290                 }
4291             }
4292
4293           if (single_defuse_cycle)
4294             reduc_def = gimple_assign_lhs (new_stmt);
4295
4296           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4297         }
4298
4299       for (i = 0; VEC_iterate (tree, vec_oprnds0, i, def0); i++)
4300         {
4301           if (slp_node)
4302             reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4303           else
4304             {
4305               if (!single_defuse_cycle || j == 0)
4306                 reduc_def = PHI_RESULT (new_phi);
4307             }
4308
4309           def1 = ((op_type == ternary_op)
4310                   ? VEC_index (tree, vec_oprnds1, i) : NULL);
4311           if (op_type == binary_op)
4312             {
4313               if (reduc_index == 0)
4314                 expr = build2 (code, vectype_out, reduc_def, def0);
4315               else
4316                 expr = build2 (code, vectype_out, def0, reduc_def);
4317             }
4318           else
4319             {
4320               if (reduc_index == 0)
4321                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4322               else
4323                 {
4324                   if (reduc_index == 1)
4325                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
4326                   else
4327                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
4328                 }
4329             }
4330
4331           new_stmt = gimple_build_assign (vec_dest, expr);
4332           new_temp = make_ssa_name (vec_dest, new_stmt);
4333           gimple_assign_set_lhs (new_stmt, new_temp);
4334           vect_finish_stmt_generation (stmt, new_stmt, gsi);
4335           if (slp_node)
4336             {
4337               VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4338               VEC_quick_push (tree, vect_defs, new_temp);
4339             }
4340           else
4341             VEC_replace (tree, vect_defs, 0, new_temp);
4342         }
4343
4344       if (slp_node)
4345         continue;
4346
4347       if (j == 0)
4348         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4349       else
4350         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4351
4352       prev_stmt_info = vinfo_for_stmt (new_stmt);
4353       prev_phi_info = vinfo_for_stmt (new_phi);
4354     }
4355
4356   /* Finalize the reduction-phi (set its arguments) and create the
4357      epilog reduction code.  */
4358   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4359     {
4360       new_temp = gimple_assign_lhs (*vec_stmt);
4361       VEC_replace (tree, vect_defs, 0, new_temp);
4362     }
4363
4364   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4365                                     epilog_reduc_code, phis, reduc_index,
4366                                     double_reduc, slp_node);
4367
4368   VEC_free (gimple, heap, phis);
4369   VEC_free (tree, heap, vec_oprnds0);
4370   if (vec_oprnds1)
4371     VEC_free (tree, heap, vec_oprnds1);
4372
4373   return true;
4374 }
4375
4376 /* Function vect_min_worthwhile_factor.
4377
4378    For a loop where we could vectorize the operation indicated by CODE,
4379    return the minimum vectorization factor that makes it worthwhile
4380    to use generic vectors.  */
4381 int
4382 vect_min_worthwhile_factor (enum tree_code code)
4383 {
4384   switch (code)
4385     {
4386     case PLUS_EXPR:
4387     case MINUS_EXPR:
4388     case NEGATE_EXPR:
4389       return 4;
4390
4391     case BIT_AND_EXPR:
4392     case BIT_IOR_EXPR:
4393     case BIT_XOR_EXPR:
4394     case BIT_NOT_EXPR:
4395       return 2;
4396
4397     default:
4398       return INT_MAX;
4399     }
4400 }
4401
4402
4403 /* Function vectorizable_induction
4404
4405    Check if PHI performs an induction computation that can be vectorized.
4406    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4407    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4408    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
4409
4410 bool
4411 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4412                         gimple *vec_stmt)
4413 {
4414   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4415   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4416   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4417   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4418   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4419   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4420   tree vec_def;
4421
4422   gcc_assert (ncopies >= 1);
4423   /* FORNOW. This restriction should be relaxed.  */
4424   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4425     {
4426       if (vect_print_dump_info (REPORT_DETAILS))
4427         fprintf (vect_dump, "multiple types in nested loop.");
4428       return false;
4429     }
4430
4431   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4432     return false;
4433
4434   /* FORNOW: SLP not supported.  */
4435   if (STMT_SLP_TYPE (stmt_info))
4436     return false;
4437
4438   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4439
4440   if (gimple_code (phi) != GIMPLE_PHI)
4441     return false;
4442
4443   if (!vec_stmt) /* transformation not required.  */
4444     {
4445       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4446       if (vect_print_dump_info (REPORT_DETAILS))
4447         fprintf (vect_dump, "=== vectorizable_induction ===");
4448       vect_model_induction_cost (stmt_info, ncopies);
4449       return true;
4450     }
4451
4452   /** Transform.  **/
4453
4454   if (vect_print_dump_info (REPORT_DETAILS))
4455     fprintf (vect_dump, "transform induction phi.");
4456
4457   vec_def = get_initial_def_for_induction (phi);
4458   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4459   return true;
4460 }
4461
4462 /* Function vectorizable_live_operation.
4463
4464    STMT computes a value that is used outside the loop. Check if
4465    it can be supported.  */
4466
4467 bool
4468 vectorizable_live_operation (gimple stmt,
4469                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4470                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4471 {
4472   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4473   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4474   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4475   int i;
4476   int op_type;
4477   tree op;
4478   tree def;
4479   gimple def_stmt;
4480   enum vect_def_type dt;
4481   enum tree_code code;
4482   enum gimple_rhs_class rhs_class;
4483
4484   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4485
4486   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4487     return false;
4488
4489   if (!is_gimple_assign (stmt))
4490     return false;
4491
4492   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4493     return false;
4494
4495   /* FORNOW. CHECKME. */
4496   if (nested_in_vect_loop_p (loop, stmt))
4497     return false;
4498
4499   code = gimple_assign_rhs_code (stmt);
4500   op_type = TREE_CODE_LENGTH (code);
4501   rhs_class = get_gimple_rhs_class (code);
4502   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4503   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4504
4505   /* FORNOW: support only if all uses are invariant. This means
4506      that the scalar operations can remain in place, unvectorized.
4507      The original last scalar value that they compute will be used.  */
4508
4509   for (i = 0; i < op_type; i++)
4510     {
4511       if (rhs_class == GIMPLE_SINGLE_RHS)
4512         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4513       else
4514         op = gimple_op (stmt, i + 1);
4515       if (op
4516           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4517         {
4518           if (vect_print_dump_info (REPORT_DETAILS))
4519             fprintf (vect_dump, "use not simple.");
4520           return false;
4521         }
4522
4523       if (dt != vect_external_def && dt != vect_constant_def)
4524         return false;
4525     }
4526
4527   /* No transformation is required for the cases we currently support.  */
4528   return true;
4529 }
4530
4531 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
4532
4533 static void
4534 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4535 {
4536   ssa_op_iter op_iter;
4537   imm_use_iterator imm_iter;
4538   def_operand_p def_p;
4539   gimple ustmt;
4540
4541   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4542     {
4543       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4544         {
4545           basic_block bb;
4546
4547           if (!is_gimple_debug (ustmt))
4548             continue;
4549
4550           bb = gimple_bb (ustmt);
4551
4552           if (!flow_bb_inside_loop_p (loop, bb))
4553             {
4554               if (gimple_debug_bind_p (ustmt))
4555                 {
4556                   if (vect_print_dump_info (REPORT_DETAILS))
4557                     fprintf (vect_dump, "killing debug use");
4558
4559                   gimple_debug_bind_reset_value (ustmt);
4560                   update_stmt (ustmt);
4561                 }
4562               else
4563                 gcc_unreachable ();
4564             }
4565         }
4566     }
4567 }
4568
4569 /* Function vect_transform_loop.
4570
4571    The analysis phase has determined that the loop is vectorizable.
4572    Vectorize the loop - created vectorized stmts to replace the scalar
4573    stmts in the loop, and update the loop exit condition.  */
4574
4575 void
4576 vect_transform_loop (loop_vec_info loop_vinfo)
4577 {
4578   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4579   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4580   int nbbs = loop->num_nodes;
4581   gimple_stmt_iterator si;
4582   int i;
4583   tree ratio = NULL;
4584   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4585   bool strided_store;
4586   bool slp_scheduled = false;
4587   unsigned int nunits;
4588   tree cond_expr = NULL_TREE;
4589   gimple_seq cond_expr_stmt_list = NULL;
4590   bool do_peeling_for_loop_bound;
4591
4592   if (vect_print_dump_info (REPORT_DETAILS))
4593     fprintf (vect_dump, "=== vec_transform_loop ===");
4594
4595   /* Peel the loop if there are data refs with unknown alignment.
4596      Only one data ref with unknown store is allowed.  */
4597
4598   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4599     vect_do_peeling_for_alignment (loop_vinfo);
4600
4601   do_peeling_for_loop_bound
4602     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4603        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4604            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4605
4606   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4607       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4608     vect_loop_versioning (loop_vinfo,
4609                           !do_peeling_for_loop_bound,
4610                           &cond_expr, &cond_expr_stmt_list);
4611
4612   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4613      compile time constant), or it is a constant that doesn't divide by the
4614      vectorization factor, then an epilog loop needs to be created.
4615      We therefore duplicate the loop: the original loop will be vectorized,
4616      and will compute the first (n/VF) iterations. The second copy of the loop
4617      will remain scalar and will compute the remaining (n%VF) iterations.
4618      (VF is the vectorization factor).  */
4619
4620   if (do_peeling_for_loop_bound)
4621     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4622                                     cond_expr, cond_expr_stmt_list);
4623   else
4624     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4625                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4626
4627   /* 1) Make sure the loop header has exactly two entries
4628      2) Make sure we have a preheader basic block.  */
4629
4630   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4631
4632   split_edge (loop_preheader_edge (loop));
4633
4634   /* FORNOW: the vectorizer supports only loops which body consist
4635      of one basic block (header + empty latch). When the vectorizer will
4636      support more involved loop forms, the order by which the BBs are
4637      traversed need to be reconsidered.  */
4638
4639   for (i = 0; i < nbbs; i++)
4640     {
4641       basic_block bb = bbs[i];
4642       stmt_vec_info stmt_info;
4643       gimple phi;
4644
4645       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4646         {
4647           phi = gsi_stmt (si);
4648           if (vect_print_dump_info (REPORT_DETAILS))
4649             {
4650               fprintf (vect_dump, "------>vectorizing phi: ");
4651               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4652             }
4653           stmt_info = vinfo_for_stmt (phi);
4654           if (!stmt_info)
4655             continue;
4656
4657           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4658             vect_loop_kill_debug_uses (loop, phi);
4659
4660           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4661               && !STMT_VINFO_LIVE_P (stmt_info))
4662             continue;
4663
4664           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4665                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4666               && vect_print_dump_info (REPORT_DETAILS))
4667             fprintf (vect_dump, "multiple-types.");
4668
4669           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4670             {
4671               if (vect_print_dump_info (REPORT_DETAILS))
4672                 fprintf (vect_dump, "transform phi.");
4673               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4674             }
4675         }
4676
4677       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4678         {
4679           gimple stmt = gsi_stmt (si);
4680           bool is_store;
4681
4682           if (vect_print_dump_info (REPORT_DETAILS))
4683             {
4684               fprintf (vect_dump, "------>vectorizing statement: ");
4685               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4686             }
4687
4688           stmt_info = vinfo_for_stmt (stmt);
4689
4690           /* vector stmts created in the outer-loop during vectorization of
4691              stmts in an inner-loop may not have a stmt_info, and do not
4692              need to be vectorized.  */
4693           if (!stmt_info)
4694             {
4695               gsi_next (&si);
4696               continue;
4697             }
4698
4699           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4700             vect_loop_kill_debug_uses (loop, stmt);
4701
4702           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4703               && !STMT_VINFO_LIVE_P (stmt_info))
4704             {
4705               gsi_next (&si);
4706               continue;
4707             }
4708
4709           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4710           nunits =
4711             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4712           if (!STMT_SLP_TYPE (stmt_info)
4713               && nunits != (unsigned int) vectorization_factor
4714               && vect_print_dump_info (REPORT_DETAILS))
4715             /* For SLP VF is set according to unrolling factor, and not to
4716                vector size, hence for SLP this print is not valid.  */
4717             fprintf (vect_dump, "multiple-types.");
4718
4719           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4720              reached.  */
4721           if (STMT_SLP_TYPE (stmt_info))
4722             {
4723               if (!slp_scheduled)
4724                 {
4725                   slp_scheduled = true;
4726
4727                   if (vect_print_dump_info (REPORT_DETAILS))
4728                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4729
4730                   vect_schedule_slp (loop_vinfo, NULL);
4731                 }
4732
4733               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4734               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4735                 {
4736                   gsi_next (&si);
4737                   continue;
4738                 }
4739             }
4740
4741           /* -------- vectorize statement ------------ */
4742           if (vect_print_dump_info (REPORT_DETAILS))
4743             fprintf (vect_dump, "transform statement.");
4744
4745           strided_store = false;
4746           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4747           if (is_store)
4748             {
4749               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4750                 {
4751                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4752                      interleaving chain was completed - free all the stores in
4753                      the chain.  */
4754                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4755                   gsi_remove (&si, true);
4756                   continue;
4757                 }
4758               else
4759                 {
4760                   /* Free the attached stmt_vec_info and remove the stmt.  */
4761                   free_stmt_vec_info (stmt);
4762                   gsi_remove (&si, true);
4763                   continue;
4764                 }
4765             }
4766           gsi_next (&si);
4767         }                       /* stmts in BB */
4768     }                           /* BBs in loop */
4769
4770   slpeel_make_loop_iterate_ntimes (loop, ratio);
4771
4772   /* The memory tags and pointers in vectorized statements need to
4773      have their SSA forms updated.  FIXME, why can't this be delayed
4774      until all the loops have been transformed?  */
4775   update_ssa (TODO_update_ssa);
4776
4777   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4778     fprintf (vect_dump, "LOOP VECTORIZED.");
4779   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4780     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4781 }