gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   3    Free Software Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "diagnostic.h"
  31 #include "tree-flow.h"
  32 #include "tree-dump.h"
  33 #include "cfgloop.h"
  34 #include "cfglayout.h"
  35 #include "expr.h"
  36 #include "recog.h"
  37 #include "optabs.h"
  38 #include "params.h"
  39 #include "toplev.h"
  40 #include "tree-chrec.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs"). These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized. It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S). This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 127    support different sizes of vectors, for now will need to specify one value
 128    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 129
 130         Since we only vectorize operations which vector form can be
 131    expressed using existing tree codes, to verify that an operation is
 132    supported, the vectorizer checks the relevant optab at the relevant
 133    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 134    the value found is CODE_FOR_nothing, then there's no target support, and
 135    we can't vectorize the stmt.
 136
 137    For additional information on this project see:
 138    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 139 */
 140
 141 /* Function vect_determine_vectorization_factor
 142
 143    Determine the vectorization factor (VF). VF is the number of data elements
 144    that are operated upon in parallel in a single iteration of the vectorized
 145    loop. For example, when vectorizing a loop that operates on 4byte elements,
 146    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 147    elements can fit in a single vector register.
 148
 149    We currently support vectorization of loops in which all types operated upon
 150    are of the same size. Therefore this function currently sets VF according to
 151    the size of the types operated upon, and fails if there are multiple sizes
 152    in the loop.
 153
 154    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 155    original loop:
 156         for (i=0; i<N; i++){
 157           a[i] = b[i] + c[i];
 158         }
 159
 160    vectorized loop:
 161         for (i=0; i<N; i+=VF){
 162           a[i:VF] = b[i:VF] + c[i:VF];
 163         }
 164 */
 165
 166 static bool
 167 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 168 {
 169   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 170   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 171   int nbbs = loop->num_nodes;
 172   gimple_stmt_iterator si;
 173   unsigned int vectorization_factor = 0;
 174   tree scalar_type;
 175   gimple phi;
 176   tree vectype;
 177   unsigned int nunits;
 178   stmt_vec_info stmt_info;
 179   int i;
 180   HOST_WIDE_INT dummy;
 181
 182   if (vect_print_dump_info (REPORT_DETAILS))
 183     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 184
 185   for (i = 0; i < nbbs; i++)
 186     {
 187       basic_block bb = bbs[i];
 188
 189       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 190         {
 191           phi = gsi_stmt (si);
 192           stmt_info = vinfo_for_stmt (phi);
 193           if (vect_print_dump_info (REPORT_DETAILS))
 194             {
 195               fprintf (vect_dump, "==> examining phi: ");
 196               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 197             }
 198
 199           gcc_assert (stmt_info);
 200
 201           if (STMT_VINFO_RELEVANT_P (stmt_info))
 202             {
 203               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 204               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 205
 206               if (vect_print_dump_info (REPORT_DETAILS))
 207                 {
 208                   fprintf (vect_dump, "get vectype for scalar type:  ");
 209                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 210                 }
 211
 212               vectype = get_vectype_for_scalar_type (scalar_type);
 213               if (!vectype)
 214                 {
 215                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 216                     {
 217                       fprintf (vect_dump,
 218                                "not vectorized: unsupported data-type ");
 219                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 220                     }
 221                   return false;
 222                 }
 223               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 224
 225               if (vect_print_dump_info (REPORT_DETAILS))
 226                 {
 227                   fprintf (vect_dump, "vectype: ");
 228                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 229                 }
 230
 231               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 232               if (vect_print_dump_info (REPORT_DETAILS))
 233                 fprintf (vect_dump, "nunits = %d", nunits);
 234
 235               if (!vectorization_factor
 236                   || (nunits > vectorization_factor))
 237                 vectorization_factor = nunits;
 238             }
 239         }
 240
 241       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 242         {
 243           gimple stmt = gsi_stmt (si);
 244           stmt_info = vinfo_for_stmt (stmt);
 245
 246           if (vect_print_dump_info (REPORT_DETAILS))
 247             {
 248               fprintf (vect_dump, "==> examining statement: ");
 249               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 250             }
 251
 252           gcc_assert (stmt_info);
 253
 254           /* skip stmts which do not need to be vectorized.  */
 255           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 256               && !STMT_VINFO_LIVE_P (stmt_info))
 257             {
 258               if (vect_print_dump_info (REPORT_DETAILS))
 259                 fprintf (vect_dump, "skip.");
 260               continue;
 261             }
 262
 263           if (gimple_get_lhs (stmt) == NULL_TREE)
 264             {
 265               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 266                 {
 267                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 268                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 269                 }
 270               return false;
 271             }
 272
 273           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 274             {
 275               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 276                 {
 277                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 278                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 279                 }
 280               return false;
 281             }
 282
 283           if (STMT_VINFO_VECTYPE (stmt_info))
 284             {
 285               /* The only case when a vectype had been already set is for stmts
 286                  that contain a dataref, or for "pattern-stmts" (stmts generated
 287                  by the vectorizer to represent/replace a certain idiom).  */
 288               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 289                           || is_pattern_stmt_p (stmt_info));
 290               vectype = STMT_VINFO_VECTYPE (stmt_info);
 291             }
 292           else
 293             {
 294               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 295                           && !is_pattern_stmt_p (stmt_info));
 296
 297               scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 298                                                            &dummy);
 299               if (vect_print_dump_info (REPORT_DETAILS))
 300                 {
 301                   fprintf (vect_dump, "get vectype for scalar type:  ");
 302                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 303                 }
 304
 305               vectype = get_vectype_for_scalar_type (scalar_type);
 306               if (!vectype)
 307                 {
 308                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 309                     {
 310                       fprintf (vect_dump,
 311                                "not vectorized: unsupported data-type ");
 312                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 313                     }
 314                   return false;
 315                 }
 316               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 317             }
 318
 319           if (vect_print_dump_info (REPORT_DETAILS))
 320             {
 321               fprintf (vect_dump, "vectype: ");
 322               print_generic_expr (vect_dump, vectype, TDF_SLIM);
 323             }
 324
 325           nunits = TYPE_VECTOR_SUBPARTS (vectype);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             fprintf (vect_dump, "nunits = %d", nunits);
 328
 329           if (!vectorization_factor
 330               || (nunits > vectorization_factor))
 331             vectorization_factor = nunits;
 332
 333         }
 334     }
 335
 336   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 337   if (vect_print_dump_info (REPORT_DETAILS))
 338     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 339   if (vectorization_factor <= 1)
 340     {
 341       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 342         fprintf (vect_dump, "not vectorized: unsupported data-type");
 343       return false;
 344     }
 345   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 346
 347   return true;
 348 }
 349
 350
 351 /* Function vect_is_simple_iv_evolution.
 352
 353    FORNOW: A simple evolution of an induction variables in the loop is
 354    considered a polynomial evolution with constant step.  */
 355
 356 static bool
 357 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 358                              tree * step)
 359 {
 360   tree init_expr;
 361   tree step_expr;
 362   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 363
 364   /* When there is no evolution in this loop, the evolution function
 365      is not "simple".  */
 366   if (evolution_part == NULL_TREE)
 367     return false;
 368
 369   /* When the evolution is a polynomial of degree >= 2
 370      the evolution function is not "simple".  */
 371   if (tree_is_chrec (evolution_part))
 372     return false;
 373
 374   step_expr = evolution_part;
 375   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 376
 377   if (vect_print_dump_info (REPORT_DETAILS))
 378     {
 379       fprintf (vect_dump, "step: ");
 380       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 381       fprintf (vect_dump, ",  init: ");
 382       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 383     }
 384
 385   *init = init_expr;
 386   *step = step_expr;
 387
 388   if (TREE_CODE (step_expr) != INTEGER_CST)
 389     {
 390       if (vect_print_dump_info (REPORT_DETAILS))
 391         fprintf (vect_dump, "step unknown.");
 392       return false;
 393     }
 394
 395   return true;
 396 }
 397
 398 /* Function vect_analyze_scalar_cycles_1.
 399
 400    Examine the cross iteration def-use cycles of scalar variables
 401    in LOOP. LOOP_VINFO represents the loop that is now being
 402    considered for vectorization (can be LOOP, or an outer-loop
 403    enclosing LOOP).  */
 404
 405 static void
 406 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 407 {
 408   basic_block bb = loop->header;
 409   tree dumy;
 410   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 411   gimple_stmt_iterator gsi;
 412   bool double_reduc;
 413
 414   if (vect_print_dump_info (REPORT_DETAILS))
 415     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 416
 417   /* First - identify all inductions. Reduction detection assumes that all the
 418      inductions have been identified, therefore, this order must not be
 419      changed.  */
 420   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 421     {
 422       gimple phi = gsi_stmt (gsi);
 423       tree access_fn = NULL;
 424       tree def = PHI_RESULT (phi);
 425       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 426
 427       if (vect_print_dump_info (REPORT_DETAILS))
 428         {
 429           fprintf (vect_dump, "Analyze phi: ");
 430           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 431         }
 432
 433       /* Skip virtual phi's. The data dependences that are associated with
 434          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 435       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 436         continue;
 437
 438       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 439
 440       /* Analyze the evolution function.  */
 441       access_fn = analyze_scalar_evolution (loop, def);
 442       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 443         {
 444           fprintf (vect_dump, "Access function of PHI: ");
 445           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 446         }
 447
 448       if (!access_fn
 449           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 450         {
 451           VEC_safe_push (gimple, heap, worklist, phi);
 452           continue;
 453         }
 454
 455       if (vect_print_dump_info (REPORT_DETAILS))
 456         fprintf (vect_dump, "Detected induction.");
 457       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 458     }
 459
 460
 461   /* Second - identify all reductions and nested cycles.  */
 462   while (VEC_length (gimple, worklist) > 0)
 463     {
 464       gimple phi = VEC_pop (gimple, worklist);
 465       tree def = PHI_RESULT (phi);
 466       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 467       gimple reduc_stmt;
 468       bool nested_cycle;
 469
 470       if (vect_print_dump_info (REPORT_DETAILS))
 471         {
 472           fprintf (vect_dump, "Analyze phi: ");
 473           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 474         }
 475
 476       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 477       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 478
 479       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 480       reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
 481                                              &double_reduc);
 482       if (reduc_stmt)
 483         {
 484           if (double_reduc)
 485             {
 486               if (vect_print_dump_info (REPORT_DETAILS))
 487                 fprintf (vect_dump, "Detected double reduction.");
 488
 489               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 490               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 491                                                     vect_double_reduction_def;
 492             }
 493           else
 494             {
 495               if (nested_cycle)
 496                 {
 497                   if (vect_print_dump_info (REPORT_DETAILS))
 498                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 499
 500                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 501                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 502                                                              vect_nested_cycle;
 503                 }
 504               else
 505                 {
 506                   if (vect_print_dump_info (REPORT_DETAILS))
 507                     fprintf (vect_dump, "Detected reduction.");
 508
 509                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 510                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 511                                                            vect_reduction_def;
 512                 }
 513             }
 514         }
 515       else
 516         if (vect_print_dump_info (REPORT_DETAILS))
 517           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 518     }
 519
 520   VEC_free (gimple, heap, worklist);
 521 }
 522
 523
 524 /* Function vect_analyze_scalar_cycles.
 525
 526    Examine the cross iteration def-use cycles of scalar variables, by
 527    analyzing the loop-header PHIs of scalar variables; Classify each
 528    cycle as one of the following: invariant, induction, reduction, unknown.
 529    We do that for the loop represented by LOOP_VINFO, and also to its
 530    inner-loop, if exists.
 531    Examples for scalar cycles:
 532
 533    Example1: reduction:
 534
 535               loop1:
 536               for (i=0; i<N; i++)
 537                  sum += a[i];
 538
 539    Example2: induction:
 540
 541               loop2:
 542               for (i=0; i<N; i++)
 543                  a[i] = i;  */
 544
 545 static void
 546 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 547 {
 548   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 549
 550   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 551
 552   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 553      Reductions in such inner-loop therefore have different properties than
 554      the reductions in the nest that gets vectorized:
 555      1. When vectorized, they are executed in the same order as in the original
 556         scalar loop, so we can't change the order of computation when
 557         vectorizing them.
 558      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 559         current checks are too strict.  */
 560
 561   if (loop->inner)
 562     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 563 }
 564
 565 /* Function vect_get_loop_niters.
 566
 567    Determine how many iterations the loop is executed.
 568    If an expression that represents the number of iterations
 569    can be constructed, place it in NUMBER_OF_ITERATIONS.
 570    Return the loop exit condition.  */
 571
 572 static gimple
 573 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 574 {
 575   tree niters;
 576
 577   if (vect_print_dump_info (REPORT_DETAILS))
 578     fprintf (vect_dump, "=== get_loop_niters ===");
 579
 580   niters = number_of_exit_cond_executions (loop);
 581
 582   if (niters != NULL_TREE
 583       && niters != chrec_dont_know)
 584     {
 585       *number_of_iterations = niters;
 586
 587       if (vect_print_dump_info (REPORT_DETAILS))
 588         {
 589           fprintf (vect_dump, "==> get_loop_niters:" );
 590           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 591         }
 592     }
 593
 594   return get_loop_exit_condition (loop);
 595 }
 596
 597
 598 /* Function bb_in_loop_p
 599
 600    Used as predicate for dfs order traversal of the loop bbs.  */
 601
 602 static bool
 603 bb_in_loop_p (const_basic_block bb, const void *data)
 604 {
 605   const struct loop *const loop = (const struct loop *)data;
 606   if (flow_bb_inside_loop_p (loop, bb))
 607     return true;
 608   return false;
 609 }
 610
 611
 612 /* Function new_loop_vec_info.
 613
 614    Create and initialize a new loop_vec_info struct for LOOP, as well as
 615    stmt_vec_info structs for all the stmts in LOOP.  */
 616
 617 static loop_vec_info
 618 new_loop_vec_info (struct loop *loop)
 619 {
 620   loop_vec_info res;
 621   basic_block *bbs;
 622   gimple_stmt_iterator si;
 623   unsigned int i, nbbs;
 624
 625   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 626   LOOP_VINFO_LOOP (res) = loop;
 627
 628   bbs = get_loop_body (loop);
 629
 630   /* Create/Update stmt_info for all stmts in the loop.  */
 631   for (i = 0; i < loop->num_nodes; i++)
 632     {
 633       basic_block bb = bbs[i];
 634
 635       /* BBs in a nested inner-loop will have been already processed (because
 636          we will have called vect_analyze_loop_form for any nested inner-loop).
 637          Therefore, for stmts in an inner-loop we just want to update the
 638          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 639          loop_info of the outer-loop we are currently considering to vectorize
 640          (instead of the loop_info of the inner-loop).
 641          For stmts in other BBs we need to create a stmt_info from scratch.  */
 642       if (bb->loop_father != loop)
 643         {
 644           /* Inner-loop bb.  */
 645           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 646           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 647             {
 648               gimple phi = gsi_stmt (si);
 649               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 650               loop_vec_info inner_loop_vinfo =
 651                 STMT_VINFO_LOOP_VINFO (stmt_info);
 652               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 653               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 654             }
 655           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 656            {
 657               gimple stmt = gsi_stmt (si);
 658               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 659               loop_vec_info inner_loop_vinfo =
 660                  STMT_VINFO_LOOP_VINFO (stmt_info);
 661               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 662               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 663            }
 664         }
 665       else
 666         {
 667           /* bb in current nest.  */
 668           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 669             {
 670               gimple phi = gsi_stmt (si);
 671               gimple_set_uid (phi, 0);
 672               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 673             }
 674
 675           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 676             {
 677               gimple stmt = gsi_stmt (si);
 678               gimple_set_uid (stmt, 0);
 679               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 680             }
 681         }
 682     }
 683
 684   /* CHECKME: We want to visit all BBs before their successors (except for
 685      latch blocks, for which this assertion wouldn't hold).  In the simple
 686      case of the loop forms we allow, a dfs order of the BBs would the same
 687      as reversed postorder traversal, so we are safe.  */
 688
 689    free (bbs);
 690    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 691    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 692                               bbs, loop->num_nodes, loop);
 693    gcc_assert (nbbs == loop->num_nodes);
 694
 695   LOOP_VINFO_BBS (res) = bbs;
 696   LOOP_VINFO_NITERS (res) = NULL;
 697   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 698   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 699   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 700   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 701   LOOP_VINFO_VECT_FACTOR (res) = 0;
 702   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 703   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 704   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 705   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 706     VEC_alloc (gimple, heap,
 707                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 708   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 709     VEC_alloc (ddr_p, heap,
 710                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 711   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 712   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 713   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 714
 715   return res;
 716 }
 717
 718
 719 /* Function destroy_loop_vec_info.
 720
 721    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 722    stmts in the loop.  */
 723
 724 void
 725 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 726 {
 727   struct loop *loop;
 728   basic_block *bbs;
 729   int nbbs;
 730   gimple_stmt_iterator si;
 731   int j;
 732   VEC (slp_instance, heap) *slp_instances;
 733   slp_instance instance;
 734
 735   if (!loop_vinfo)
 736     return;
 737
 738   loop = LOOP_VINFO_LOOP (loop_vinfo);
 739
 740   bbs = LOOP_VINFO_BBS (loop_vinfo);
 741   nbbs = loop->num_nodes;
 742
 743   if (!clean_stmts)
 744     {
 745       free (LOOP_VINFO_BBS (loop_vinfo));
 746       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 747       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 748       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 749
 750       free (loop_vinfo);
 751       loop->aux = NULL;
 752       return;
 753     }
 754
 755   for (j = 0; j < nbbs; j++)
 756     {
 757       basic_block bb = bbs[j];
 758       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 759         free_stmt_vec_info (gsi_stmt (si));
 760
 761       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 762         {
 763           gimple stmt = gsi_stmt (si);
 764           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 765
 766           if (stmt_info)
 767             {
 768               /* Check if this is a "pattern stmt" (introduced by the
 769                  vectorizer during the pattern recognition pass).  */
 770               bool remove_stmt_p = false;
 771               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 772               if (orig_stmt)
 773                 {
 774                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 775                   if (orig_stmt_info
 776                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 777                     remove_stmt_p = true;
 778                 }
 779
 780               /* Free stmt_vec_info.  */
 781               free_stmt_vec_info (stmt);
 782
 783               /* Remove dead "pattern stmts".  */
 784               if (remove_stmt_p)
 785                 gsi_remove (&si, true);
 786             }
 787           gsi_next (&si);
 788         }
 789     }
 790
 791   free (LOOP_VINFO_BBS (loop_vinfo));
 792   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 793   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 794   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 795   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 796   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 797   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 798     vect_free_slp_instance (instance);
 799
 800   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 801   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 802
 803   free (loop_vinfo);
 804   loop->aux = NULL;
 805 }
 806
 807
 808 /* Function vect_analyze_loop_1.
 809
 810    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 811    for it. The different analyses will record information in the
 812    loop_vec_info struct.  This is a subset of the analyses applied in
 813    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 814    that is now considered for (outer-loop) vectorization.  */
 815
 816 static loop_vec_info
 817 vect_analyze_loop_1 (struct loop *loop)
 818 {
 819   loop_vec_info loop_vinfo;
 820
 821   if (vect_print_dump_info (REPORT_DETAILS))
 822     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 823
 824   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 825
 826   loop_vinfo = vect_analyze_loop_form (loop);
 827   if (!loop_vinfo)
 828     {
 829       if (vect_print_dump_info (REPORT_DETAILS))
 830         fprintf (vect_dump, "bad inner-loop form.");
 831       return NULL;
 832     }
 833
 834   return loop_vinfo;
 835 }
 836
 837
 838 /* Function vect_analyze_loop_form.
 839
 840    Verify that certain CFG restrictions hold, including:
 841    - the loop has a pre-header
 842    - the loop has a single entry and exit
 843    - the loop exit condition is simple enough, and the number of iterations
 844      can be analyzed (a countable loop).  */
 845
 846 loop_vec_info
 847 vect_analyze_loop_form (struct loop *loop)
 848 {
 849   loop_vec_info loop_vinfo;
 850   gimple loop_cond;
 851   tree number_of_iterations = NULL;
 852   loop_vec_info inner_loop_vinfo = NULL;
 853
 854   if (vect_print_dump_info (REPORT_DETAILS))
 855     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 856
 857   /* Different restrictions apply when we are considering an inner-most loop,
 858      vs. an outer (nested) loop.
 859      (FORNOW. May want to relax some of these restrictions in the future).  */
 860
 861   if (!loop->inner)
 862     {
 863       /* Inner-most loop.  We currently require that the number of BBs is
 864          exactly 2 (the header and latch).  Vectorizable inner-most loops
 865          look like this:
 866
 867                         (pre-header)
 868                            |
 869                           header <--------+
 870                            | |            |
 871                            | +--> latch --+
 872                            |
 873                         (exit-bb)  */
 874
 875       if (loop->num_nodes != 2)
 876         {
 877           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 878             fprintf (vect_dump, "not vectorized: control flow in loop.");
 879           return NULL;
 880         }
 881
 882       if (empty_block_p (loop->header))
 883     {
 884           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 885             fprintf (vect_dump, "not vectorized: empty loop.");
 886       return NULL;
 887     }
 888     }
 889   else
 890     {
 891       struct loop *innerloop = loop->inner;
 892       edge entryedge;
 893
 894       /* Nested loop. We currently require that the loop is doubly-nested,
 895          contains a single inner loop, and the number of BBs is exactly 5.
 896          Vectorizable outer-loops look like this:
 897
 898                         (pre-header)
 899                            |
 900                           header <---+
 901                            |         |
 902                           inner-loop |
 903                            |         |
 904                           tail ------+
 905                            |
 906                         (exit-bb)
 907
 908          The inner-loop has the properties expected of inner-most loops
 909          as described above.  */
 910
 911       if ((loop->inner)->inner || (loop->inner)->next)
 912         {
 913           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 914             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 915           return NULL;
 916         }
 917
 918       /* Analyze the inner-loop.  */
 919       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 920       if (!inner_loop_vinfo)
 921         {
 922           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 923             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 924           return NULL;
 925         }
 926
 927       if (!expr_invariant_in_loop_p (loop,
 928                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 929         {
 930           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 931             fprintf (vect_dump,
 932                      "not vectorized: inner-loop count not invariant.");
 933           destroy_loop_vec_info (inner_loop_vinfo, true);
 934           return NULL;
 935         }
 936
 937       if (loop->num_nodes != 5)
 938         {
 939           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 940             fprintf (vect_dump, "not vectorized: control flow in loop.");
 941           destroy_loop_vec_info (inner_loop_vinfo, true);
 942           return NULL;
 943         }
 944
 945       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 946       entryedge = EDGE_PRED (innerloop->header, 0);
 947       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 948         entryedge = EDGE_PRED (innerloop->header, 1);
 949
 950       if (entryedge->src != loop->header
 951           || !single_exit (innerloop)
 952           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
 953         {
 954           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 955             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
 956           destroy_loop_vec_info (inner_loop_vinfo, true);
 957           return NULL;
 958         }
 959
 960       if (vect_print_dump_info (REPORT_DETAILS))
 961         fprintf (vect_dump, "Considering outer-loop vectorization.");
 962     }
 963
 964   if (!single_exit (loop)
 965       || EDGE_COUNT (loop->header->preds) != 2)
 966     {
 967       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 968         {
 969           if (!single_exit (loop))
 970             fprintf (vect_dump, "not vectorized: multiple exits.");
 971           else if (EDGE_COUNT (loop->header->preds) != 2)
 972             fprintf (vect_dump, "not vectorized: too many incoming edges.");
 973         }
 974       if (inner_loop_vinfo)
 975         destroy_loop_vec_info (inner_loop_vinfo, true);
 976       return NULL;
 977     }
 978
 979   /* We assume that the loop exit condition is at the end of the loop. i.e,
 980      that the loop is represented as a do-while (with a proper if-guard
 981      before the loop if needed), where the loop header contains all the
 982      executable statements, and the latch is empty.  */
 983   if (!empty_block_p (loop->latch)
 984         || !gimple_seq_empty_p (phi_nodes (loop->latch)))
 985     {
 986       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 987         fprintf (vect_dump, "not vectorized: unexpected loop form.");
 988       if (inner_loop_vinfo)
 989         destroy_loop_vec_info (inner_loop_vinfo, true);
 990       return NULL;
 991     }
 992
 993   /* Make sure there exists a single-predecessor exit bb:  */
 994   if (!single_pred_p (single_exit (loop)->dest))
 995     {
 996       edge e = single_exit (loop);
 997       if (!(e->flags & EDGE_ABNORMAL))
 998         {
 999           split_loop_exit_edge (e);
1000           if (vect_print_dump_info (REPORT_DETAILS))
1001             fprintf (vect_dump, "split exit edge.");
1002         }
1003       else
1004         {
1005           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1006             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1007           if (inner_loop_vinfo)
1008             destroy_loop_vec_info (inner_loop_vinfo, true);
1009           return NULL;
1010         }
1011     }
1012
1013   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1014   if (!loop_cond)
1015     {
1016       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1017         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1018       if (inner_loop_vinfo)
1019         destroy_loop_vec_info (inner_loop_vinfo, true);
1020       return NULL;
1021     }
1022
1023   if (!number_of_iterations)
1024     {
1025       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1026         fprintf (vect_dump,
1027                  "not vectorized: number of iterations cannot be computed.");
1028       if (inner_loop_vinfo)
1029         destroy_loop_vec_info (inner_loop_vinfo, true);
1030       return NULL;
1031     }
1032
1033   if (chrec_contains_undetermined (number_of_iterations))
1034     {
1035       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036         fprintf (vect_dump, "Infinite number of iterations.");
1037       if (inner_loop_vinfo)
1038         destroy_loop_vec_info (inner_loop_vinfo, true);
1039       return NULL;
1040     }
1041
1042   if (!NITERS_KNOWN_P (number_of_iterations))
1043     {
1044       if (vect_print_dump_info (REPORT_DETAILS))
1045         {
1046           fprintf (vect_dump, "Symbolic number of iterations is ");
1047           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1048         }
1049     }
1050   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1051     {
1052       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1053         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1054       if (inner_loop_vinfo)
1055         destroy_loop_vec_info (inner_loop_vinfo, false);
1056       return NULL;
1057     }
1058
1059   loop_vinfo = new_loop_vec_info (loop);
1060   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1061   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1062
1063   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1064
1065   /* CHECKME: May want to keep it around it in the future.  */
1066   if (inner_loop_vinfo)
1067     destroy_loop_vec_info (inner_loop_vinfo, false);
1068
1069   gcc_assert (!loop->aux);
1070   loop->aux = loop_vinfo;
1071   return loop_vinfo;
1072 }
1073
1074
1075 /* Function vect_analyze_loop_operations.
1076
1077    Scan the loop stmts and make sure they are all vectorizable.  */
1078
1079 static bool
1080 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1081 {
1082   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1083   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1084   int nbbs = loop->num_nodes;
1085   gimple_stmt_iterator si;
1086   unsigned int vectorization_factor = 0;
1087   int i;
1088   gimple phi;
1089   stmt_vec_info stmt_info;
1090   bool need_to_vectorize = false;
1091   int min_profitable_iters;
1092   int min_scalar_loop_bound;
1093   unsigned int th;
1094   bool only_slp_in_loop = true, ok;
1095
1096   if (vect_print_dump_info (REPORT_DETAILS))
1097     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1098
1099   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1100   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1101
1102   for (i = 0; i < nbbs; i++)
1103     {
1104       basic_block bb = bbs[i];
1105
1106       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1107         {
1108           phi = gsi_stmt (si);
1109           ok = true;
1110
1111           stmt_info = vinfo_for_stmt (phi);
1112           if (vect_print_dump_info (REPORT_DETAILS))
1113             {
1114               fprintf (vect_dump, "examining phi: ");
1115               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1116             }
1117
1118           if (! is_loop_header_bb_p (bb))
1119             {
1120               /* inner-loop loop-closed exit phi in outer-loop vectorization
1121                  (i.e. a phi in the tail of the outer-loop).
1122                  FORNOW: we currently don't support the case that these phis
1123                  are not used in the outerloop (unless it is double reduction,
1124                  i.e., this phi is vect_reduction_def), cause this case
1125                  requires to actually do something here.  */
1126               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1127                    || STMT_VINFO_LIVE_P (stmt_info))
1128                   && STMT_VINFO_DEF_TYPE (stmt_info)
1129                      != vect_double_reduction_def)
1130                 {
1131                   if (vect_print_dump_info (REPORT_DETAILS))
1132                     fprintf (vect_dump,
1133                              "Unsupported loop-closed phi in outer-loop.");
1134                   return false;
1135                 }
1136               continue;
1137             }
1138
1139           gcc_assert (stmt_info);
1140
1141           if (STMT_VINFO_LIVE_P (stmt_info))
1142             {
1143               /* FORNOW: not yet supported.  */
1144               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1145                 fprintf (vect_dump, "not vectorized: value used after loop.");
1146               return false;
1147             }
1148
1149           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1150               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1151             {
1152               /* A scalar-dependence cycle that we don't support.  */
1153               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1154                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1155               return false;
1156             }
1157
1158           if (STMT_VINFO_RELEVANT_P (stmt_info))
1159             {
1160               need_to_vectorize = true;
1161               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1162                 ok = vectorizable_induction (phi, NULL, NULL);
1163             }
1164
1165           if (!ok)
1166             {
1167               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1168                 {
1169                   fprintf (vect_dump,
1170                            "not vectorized: relevant phi not supported: ");
1171                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1172                 }
1173               return false;
1174             }
1175         }
1176
1177       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1178         {
1179           gimple stmt = gsi_stmt (si);
1180           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1181
1182           gcc_assert (stmt_info);
1183
1184           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1185             return false;
1186
1187           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1188                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1189               && !PURE_SLP_STMT (stmt_info))
1190
1191             /* STMT needs both SLP and loop-based vectorization.  */
1192             only_slp_in_loop = false;
1193         }
1194     } /* bbs */
1195
1196   /* All operations in the loop are either irrelevant (deal with loop
1197      control, or dead), or only used outside the loop and can be moved
1198      out of the loop (e.g. invariants, inductions).  The loop can be
1199      optimized away by scalar optimizations.  We're better off not
1200      touching this loop.  */
1201   if (!need_to_vectorize)
1202     {
1203       if (vect_print_dump_info (REPORT_DETAILS))
1204         fprintf (vect_dump,
1205                  "All the computation can be taken out of the loop.");
1206       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1207         fprintf (vect_dump,
1208                  "not vectorized: redundant loop. no profit to vectorize.");
1209       return false;
1210     }
1211
1212   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1213      vectorization factor of the loop is the unrolling factor required by the
1214      SLP instances.  If that unrolling factor is 1, we say, that we perform
1215      pure SLP on loop - cross iteration parallelism is not exploited.  */
1216   if (only_slp_in_loop)
1217     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1218   else
1219     vectorization_factor = least_common_multiple (vectorization_factor,
1220                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1221
1222   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1223
1224   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1225       && vect_print_dump_info (REPORT_DETAILS))
1226     fprintf (vect_dump,
1227         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1228         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1229
1230   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1231       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1232     {
1233       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1234         fprintf (vect_dump, "not vectorized: iteration count too small.");
1235       if (vect_print_dump_info (REPORT_DETAILS))
1236         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1237                  "vectorization factor.");
1238       return false;
1239     }
1240
1241   /* Analyze cost. Decide if worth while to vectorize.  */
1242
1243   /* Once VF is set, SLP costs should be updated since the number of created
1244      vector stmts depends on VF.  */
1245   vect_update_slp_costs_according_to_vf (loop_vinfo);
1246
1247   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1248   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1249
1250   if (min_profitable_iters < 0)
1251     {
1252       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1253         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1254       if (vect_print_dump_info (REPORT_DETAILS))
1255         fprintf (vect_dump, "not vectorized: vector version will never be "
1256                  "profitable.");
1257       return false;
1258     }
1259
1260   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1261                             * vectorization_factor) - 1);
1262
1263   /* Use the cost model only if it is more conservative than user specified
1264      threshold.  */
1265
1266   th = (unsigned) min_scalar_loop_bound;
1267   if (min_profitable_iters
1268       && (!min_scalar_loop_bound
1269           || min_profitable_iters > min_scalar_loop_bound))
1270     th = (unsigned) min_profitable_iters;
1271
1272   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1273       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1274     {
1275       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1276         fprintf (vect_dump, "not vectorized: vectorization not "
1277                  "profitable.");
1278       if (vect_print_dump_info (REPORT_DETAILS))
1279         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1280                  "user specified loop bound parameter or minimum "
1281                  "profitable iterations (whichever is more conservative).");
1282       return false;
1283     }
1284
1285   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1286       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1287       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1288     {
1289       if (vect_print_dump_info (REPORT_DETAILS))
1290         fprintf (vect_dump, "epilog loop required.");
1291       if (!vect_can_advance_ivs_p (loop_vinfo))
1292         {
1293           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1294             fprintf (vect_dump,
1295                      "not vectorized: can't create epilog loop 1.");
1296           return false;
1297         }
1298       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1299         {
1300           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1301             fprintf (vect_dump,
1302                      "not vectorized: can't create epilog loop 2.");
1303           return false;
1304         }
1305     }
1306
1307   return true;
1308 }
1309
1310
1311 /* Function vect_analyze_loop.
1312
1313    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1314    for it. The different analyses will record information in the
1315    loop_vec_info struct.  */
1316 loop_vec_info
1317 vect_analyze_loop (struct loop *loop)
1318 {
1319   bool ok;
1320   loop_vec_info loop_vinfo;
1321
1322   if (vect_print_dump_info (REPORT_DETAILS))
1323     fprintf (vect_dump, "===== analyze_loop_nest =====");
1324
1325   if (loop_outer (loop)
1326       && loop_vec_info_for_loop (loop_outer (loop))
1327       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1328     {
1329       if (vect_print_dump_info (REPORT_DETAILS))
1330         fprintf (vect_dump, "outer-loop already vectorized.");
1331       return NULL;
1332     }
1333
1334   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1335
1336   loop_vinfo = vect_analyze_loop_form (loop);
1337   if (!loop_vinfo)
1338     {
1339       if (vect_print_dump_info (REPORT_DETAILS))
1340         fprintf (vect_dump, "bad loop form.");
1341       return NULL;
1342     }
1343
1344   /* Find all data references in the loop (which correspond to vdefs/vuses)
1345      and analyze their evolution in the loop.
1346
1347      FORNOW: Handle only simple, array references, which
1348      alignment can be forced, and aligned pointer-references.  */
1349
1350   ok = vect_analyze_data_refs (loop_vinfo, NULL);
1351   if (!ok)
1352     {
1353       if (vect_print_dump_info (REPORT_DETAILS))
1354         fprintf (vect_dump, "bad data references.");
1355       destroy_loop_vec_info (loop_vinfo, true);
1356       return NULL;
1357     }
1358
1359   /* Classify all cross-iteration scalar data-flow cycles.
1360      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1361
1362   vect_analyze_scalar_cycles (loop_vinfo);
1363
1364   vect_pattern_recog (loop_vinfo);
1365
1366   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1367
1368   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1369   if (!ok)
1370     {
1371       if (vect_print_dump_info (REPORT_DETAILS))
1372         fprintf (vect_dump, "unexpected pattern.");
1373       destroy_loop_vec_info (loop_vinfo, true);
1374       return NULL;
1375     }
1376
1377   /* Analyze the alignment of the data-refs in the loop.
1378      Fail if a data reference is found that cannot be vectorized.  */
1379
1380   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1381   if (!ok)
1382     {
1383       if (vect_print_dump_info (REPORT_DETAILS))
1384         fprintf (vect_dump, "bad data alignment.");
1385       destroy_loop_vec_info (loop_vinfo, true);
1386       return NULL;
1387     }
1388
1389   ok = vect_determine_vectorization_factor (loop_vinfo);
1390   if (!ok)
1391     {
1392       if (vect_print_dump_info (REPORT_DETAILS))
1393         fprintf (vect_dump, "can't determine vectorization factor.");
1394       destroy_loop_vec_info (loop_vinfo, true);
1395       return NULL;
1396     }
1397
1398   /* Analyze data dependences between the data-refs in the loop.
1399      FORNOW: fail at the first data dependence that we encounter.  */
1400
1401   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
1402   if (!ok)
1403     {
1404       if (vect_print_dump_info (REPORT_DETAILS))
1405         fprintf (vect_dump, "bad data dependence.");
1406       destroy_loop_vec_info (loop_vinfo, true);
1407       return NULL;
1408     }
1409
1410   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1411      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1412
1413   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1414   if (!ok)
1415     {
1416       if (vect_print_dump_info (REPORT_DETAILS))
1417         fprintf (vect_dump, "bad data access.");
1418       destroy_loop_vec_info (loop_vinfo, true);
1419       return NULL;
1420     }
1421
1422   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1423      It is important to call pruning after vect_analyze_data_ref_accesses,
1424      since we use grouping information gathered by interleaving analysis.  */
1425   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1426   if (!ok)
1427     {
1428       if (vect_print_dump_info (REPORT_DETAILS))
1429         fprintf (vect_dump, "too long list of versioning for alias "
1430                             "run-time tests.");
1431       destroy_loop_vec_info (loop_vinfo, true);
1432       return NULL;
1433     }
1434
1435   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1436   ok = vect_analyze_slp (loop_vinfo, NULL);
1437   if (ok)
1438     {
1439       /* Decide which possible SLP instances to SLP.  */
1440       vect_make_slp_decision (loop_vinfo);
1441
1442       /* Find stmts that need to be both vectorized and SLPed.  */
1443       vect_detect_hybrid_slp (loop_vinfo);
1444     }
1445
1446   /* This pass will decide on using loop versioning and/or loop peeling in
1447      order to enhance the alignment of data references in the loop.  */
1448
1449   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1450   if (!ok)
1451     {
1452       if (vect_print_dump_info (REPORT_DETAILS))
1453         fprintf (vect_dump, "bad data alignment.");
1454       destroy_loop_vec_info (loop_vinfo, true);
1455       return NULL;
1456     }
1457
1458   /* Scan all the operations in the loop and make sure they are
1459      vectorizable.  */
1460
1461   ok = vect_analyze_loop_operations (loop_vinfo);
1462   if (!ok)
1463     {
1464       if (vect_print_dump_info (REPORT_DETAILS))
1465         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1466       destroy_loop_vec_info (loop_vinfo, true);
1467       return NULL;
1468     }
1469
1470   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1471
1472   return loop_vinfo;
1473 }
1474
1475
1476 /* Function reduction_code_for_scalar_code
1477
1478    Input:
1479    CODE - tree_code of a reduction operations.
1480
1481    Output:
1482    REDUC_CODE - the corresponding tree-code to be used to reduce the
1483       vector of partial results into a single scalar result (which
1484       will also reside in a vector) or ERROR_MARK if the operation is
1485       a supported reduction operation, but does not have such tree-code.
1486
1487    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1488
1489 static bool
1490 reduction_code_for_scalar_code (enum tree_code code,
1491                                 enum tree_code *reduc_code)
1492 {
1493   switch (code)
1494     {
1495       case MAX_EXPR:
1496         *reduc_code = REDUC_MAX_EXPR;
1497         return true;
1498
1499       case MIN_EXPR:
1500         *reduc_code = REDUC_MIN_EXPR;
1501         return true;
1502
1503       case PLUS_EXPR:
1504         *reduc_code = REDUC_PLUS_EXPR;
1505         return true;
1506
1507       case MULT_EXPR:
1508       case MINUS_EXPR:
1509       case BIT_IOR_EXPR:
1510       case BIT_XOR_EXPR:
1511       case BIT_AND_EXPR:
1512         *reduc_code = ERROR_MARK;
1513         return true;
1514
1515       default:
1516        return false;
1517     }
1518 }
1519
1520
1521 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1522    STMT is printed with a message MSG. */
1523
1524 static void
1525 report_vect_op (gimple stmt, const char *msg)
1526 {
1527   fprintf (vect_dump, "%s", msg);
1528   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1529 }
1530
1531
1532 /* Function vect_is_simple_reduction
1533
1534    (1) Detect a cross-iteration def-use cycle that represents a simple
1535    reduction computation. We look for the following pattern:
1536
1537    loop_header:
1538      a1 = phi < a0, a2 >
1539      a3 = ...
1540      a2 = operation (a3, a1)
1541
1542    such that:
1543    1. operation is commutative and associative and it is safe to
1544       change the order of the computation (if CHECK_REDUCTION is true)
1545    2. no uses for a2 in the loop (a2 is used out of the loop)
1546    3. no uses of a1 in the loop besides the reduction operation.
1547
1548    Condition 1 is tested here.
1549    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1550
1551    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1552    nested cycles, if CHECK_REDUCTION is false.
1553
1554    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1555    reductions:
1556
1557      a1 = phi < a0, a2 >
1558      inner loop (def of a3)
1559      a2 = phi < a3 >
1560 */
1561
1562 gimple
1563 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1564                           bool check_reduction, bool *double_reduc)
1565 {
1566   struct loop *loop = (gimple_bb (phi))->loop_father;
1567   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1568   edge latch_e = loop_latch_edge (loop);
1569   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1570   gimple def_stmt, def1 = NULL, def2 = NULL;
1571   enum tree_code code;
1572   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1573   tree type;
1574   int nloop_uses;
1575   tree name;
1576   imm_use_iterator imm_iter;
1577   use_operand_p use_p;
1578   bool phi_def;
1579
1580   *double_reduc = false;
1581
1582   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1583      otherwise, we assume outer loop vectorization.  */
1584   gcc_assert ((check_reduction && loop == vect_loop)
1585               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1586
1587   name = PHI_RESULT (phi);
1588   nloop_uses = 0;
1589   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1590     {
1591       gimple use_stmt = USE_STMT (use_p);
1592       if (is_gimple_debug (use_stmt))
1593         continue;
1594       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1595           && vinfo_for_stmt (use_stmt)
1596           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1597         nloop_uses++;
1598       if (nloop_uses > 1)
1599         {
1600           if (vect_print_dump_info (REPORT_DETAILS))
1601             fprintf (vect_dump, "reduction used in loop.");
1602           return NULL;
1603         }
1604     }
1605
1606   if (TREE_CODE (loop_arg) != SSA_NAME)
1607     {
1608       if (vect_print_dump_info (REPORT_DETAILS))
1609         {
1610           fprintf (vect_dump, "reduction: not ssa_name: ");
1611           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1612         }
1613       return NULL;
1614     }
1615
1616   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1617   if (!def_stmt)
1618     {
1619       if (vect_print_dump_info (REPORT_DETAILS))
1620         fprintf (vect_dump, "reduction: no def_stmt.");
1621       return NULL;
1622     }
1623
1624   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1625     {
1626       if (vect_print_dump_info (REPORT_DETAILS))
1627         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1628       return NULL;
1629     }
1630
1631   if (is_gimple_assign (def_stmt))
1632     {
1633       name = gimple_assign_lhs (def_stmt);
1634       phi_def = false;
1635     }
1636   else
1637     {
1638       name = PHI_RESULT (def_stmt);
1639       phi_def = true;
1640     }
1641
1642   nloop_uses = 0;
1643   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1644     {
1645       gimple use_stmt = USE_STMT (use_p);
1646       if (is_gimple_debug (use_stmt))
1647         continue;
1648       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1649           && vinfo_for_stmt (use_stmt)
1650           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1651         nloop_uses++;
1652       if (nloop_uses > 1)
1653         {
1654           if (vect_print_dump_info (REPORT_DETAILS))
1655             fprintf (vect_dump, "reduction used in loop.");
1656           return NULL;
1657         }
1658     }
1659
1660   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1661      defined in the inner loop.  */
1662   if (phi_def)
1663     {
1664       op1 = PHI_ARG_DEF (def_stmt, 0);
1665
1666       if (gimple_phi_num_args (def_stmt) != 1
1667           || TREE_CODE (op1) != SSA_NAME)
1668         {
1669           if (vect_print_dump_info (REPORT_DETAILS))
1670             fprintf (vect_dump, "unsupported phi node definition.");
1671
1672           return NULL;
1673         }
1674
1675       def1 = SSA_NAME_DEF_STMT (op1);
1676       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1677           && loop->inner
1678           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1679           && is_gimple_assign (def1))
1680         {
1681           if (vect_print_dump_info (REPORT_DETAILS))
1682             report_vect_op (def_stmt, "detected double reduction: ");
1683
1684           *double_reduc = true;
1685           return def_stmt;
1686         }
1687
1688       return NULL;
1689     }
1690
1691   code = gimple_assign_rhs_code (def_stmt);
1692
1693   if (check_reduction
1694       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1695     {
1696       if (vect_print_dump_info (REPORT_DETAILS))
1697         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1698       return NULL;
1699     }
1700
1701   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1702     {
1703       if (code != COND_EXPR)
1704         {
1705           if (vect_print_dump_info (REPORT_DETAILS))
1706             report_vect_op (def_stmt, "reduction: not binary operation: ");
1707
1708           return NULL;
1709         }
1710
1711       op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1712       if (COMPARISON_CLASS_P (op3))
1713         {
1714           op4 = TREE_OPERAND (op3, 1);
1715           op3 = TREE_OPERAND (op3, 0);
1716         }
1717
1718       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1719       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1720
1721       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1722         {
1723           if (vect_print_dump_info (REPORT_DETAILS))
1724             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1725
1726           return NULL;
1727         }
1728     }
1729   else
1730     {
1731       op1 = gimple_assign_rhs1 (def_stmt);
1732       op2 = gimple_assign_rhs2 (def_stmt);
1733
1734       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1735         {
1736           if (vect_print_dump_info (REPORT_DETAILS))
1737             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1738
1739           return NULL;
1740         }
1741    }
1742
1743   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1744   if ((TREE_CODE (op1) == SSA_NAME
1745        && !types_compatible_p (type,TREE_TYPE (op1)))
1746       || (TREE_CODE (op2) == SSA_NAME
1747           && !types_compatible_p (type, TREE_TYPE (op2)))
1748       || (op3 && TREE_CODE (op3) == SSA_NAME
1749           && !types_compatible_p (type, TREE_TYPE (op3)))
1750       || (op4 && TREE_CODE (op4) == SSA_NAME
1751           && !types_compatible_p (type, TREE_TYPE (op4))))
1752     {
1753       if (vect_print_dump_info (REPORT_DETAILS))
1754         {
1755           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1756           print_generic_expr (vect_dump, type, TDF_SLIM);
1757           fprintf (vect_dump, ", operands types: ");
1758           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1759           fprintf (vect_dump, ",");
1760           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1761           if (op3)
1762             {
1763               fprintf (vect_dump, ",");
1764               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1765             }
1766
1767           if (op4)
1768             {
1769               fprintf (vect_dump, ",");
1770               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1771             }
1772         }
1773
1774       return NULL;
1775     }
1776
1777   /* Check that it's ok to change the order of the computation.
1778      Generally, when vectorizing a reduction we change the order of the
1779      computation.  This may change the behavior of the program in some
1780      cases, so we need to check that this is ok.  One exception is when
1781      vectorizing an outer-loop: the inner-loop is executed sequentially,
1782      and therefore vectorizing reductions in the inner-loop during
1783      outer-loop vectorization is safe.  */
1784
1785   /* CHECKME: check for !flag_finite_math_only too?  */
1786   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1787       && check_reduction)
1788     {
1789       /* Changing the order of operations changes the semantics.  */
1790       if (vect_print_dump_info (REPORT_DETAILS))
1791         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1792       return NULL;
1793     }
1794   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1795            && check_reduction)
1796     {
1797       /* Changing the order of operations changes the semantics.  */
1798       if (vect_print_dump_info (REPORT_DETAILS))
1799         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1800       return NULL;
1801     }
1802   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1803     {
1804       /* Changing the order of operations changes the semantics.  */
1805       if (vect_print_dump_info (REPORT_DETAILS))
1806         report_vect_op (def_stmt,
1807                         "reduction: unsafe fixed-point math optimization: ");
1808       return NULL;
1809     }
1810
1811   /* Reduction is safe. We're dealing with one of the following:
1812      1) integer arithmetic and no trapv
1813      2) floating point arithmetic, and special flags permit this optimization
1814      3) nested cycle (i.e., outer loop vectorization).  */
1815   if (TREE_CODE (op1) == SSA_NAME)
1816     def1 = SSA_NAME_DEF_STMT (op1);
1817
1818   if (TREE_CODE (op2) == SSA_NAME)
1819     def2 = SSA_NAME_DEF_STMT (op2);
1820
1821   if (code != COND_EXPR
1822       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1823     {
1824       if (vect_print_dump_info (REPORT_DETAILS))
1825         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1826       return NULL;
1827     }
1828
1829   /* Check that one def is the reduction def, defined by PHI,
1830      the other def is either defined in the loop ("vect_internal_def"),
1831      or it's an induction (defined by a loop-header phi-node).  */
1832
1833   if (def2 && def2 == phi
1834       && (code == COND_EXPR
1835           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1836               && (is_gimple_assign (def1)
1837                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1838                       == vect_induction_def
1839                   || (gimple_code (def1) == GIMPLE_PHI
1840                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1841                           == vect_internal_def
1842                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1843     {
1844       if (vect_print_dump_info (REPORT_DETAILS))
1845         report_vect_op (def_stmt, "detected reduction: ");
1846       return def_stmt;
1847     }
1848   else if (def1 && def1 == phi
1849            && (code == COND_EXPR
1850                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1851                    && (is_gimple_assign (def2)
1852                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1853                            == vect_induction_def
1854                        || (gimple_code (def2) == GIMPLE_PHI
1855                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1856                                == vect_internal_def
1857                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
1858     {
1859       if (check_reduction)
1860         {
1861           /* Swap operands (just for simplicity - so that the rest of the code
1862              can assume that the reduction variable is always the last (second)
1863              argument).  */
1864           if (vect_print_dump_info (REPORT_DETAILS))
1865             report_vect_op (def_stmt,
1866                             "detected reduction: need to swap operands: ");
1867
1868           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1869                               gimple_assign_rhs2_ptr (def_stmt));
1870         }
1871       else
1872         {
1873           if (vect_print_dump_info (REPORT_DETAILS))
1874             report_vect_op (def_stmt, "detected reduction: ");
1875         }
1876
1877       return def_stmt;
1878     }
1879   else
1880     {
1881       if (vect_print_dump_info (REPORT_DETAILS))
1882         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1883
1884       return NULL;
1885     }
1886 }
1887
1888
1889 /* Function vect_estimate_min_profitable_iters
1890
1891    Return the number of iterations required for the vector version of the
1892    loop to be profitable relative to the cost of the scalar version of the
1893    loop.
1894
1895    TODO: Take profile info into account before making vectorization
1896    decisions, if available.  */
1897
1898 int
1899 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1900 {
1901   int i;
1902   int min_profitable_iters;
1903   int peel_iters_prologue;
1904   int peel_iters_epilogue;
1905   int vec_inside_cost = 0;
1906   int vec_outside_cost = 0;
1907   int scalar_single_iter_cost = 0;
1908   int scalar_outside_cost = 0;
1909   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1910   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1911   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1912   int nbbs = loop->num_nodes;
1913   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1914   int peel_guard_costs = 0;
1915   int innerloop_iters = 0, factor;
1916   VEC (slp_instance, heap) *slp_instances;
1917   slp_instance instance;
1918
1919   /* Cost model disabled.  */
1920   if (!flag_vect_cost_model)
1921     {
1922       if (vect_print_dump_info (REPORT_COST))
1923         fprintf (vect_dump, "cost model disabled.");
1924       return 0;
1925     }
1926
1927   /* Requires loop versioning tests to handle misalignment.  */
1928   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1929     {
1930       /*  FIXME: Make cost depend on complexity of individual check.  */
1931       vec_outside_cost +=
1932         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1933       if (vect_print_dump_info (REPORT_COST))
1934         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1935                  "versioning to treat misalignment.\n");
1936     }
1937
1938   /* Requires loop versioning with alias checks.  */
1939   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1940     {
1941       /*  FIXME: Make cost depend on complexity of individual check.  */
1942       vec_outside_cost +=
1943         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1944       if (vect_print_dump_info (REPORT_COST))
1945         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1946                  "versioning aliasing.\n");
1947     }
1948
1949   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1950       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1951     vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1952
1953   /* Count statements in scalar loop.  Using this as scalar cost for a single
1954      iteration for now.
1955
1956      TODO: Add outer loop support.
1957
1958      TODO: Consider assigning different costs to different scalar
1959      statements.  */
1960
1961   /* FORNOW.  */
1962   if (loop->inner)
1963     innerloop_iters = 50; /* FIXME */
1964
1965   for (i = 0; i < nbbs; i++)
1966     {
1967       gimple_stmt_iterator si;
1968       basic_block bb = bbs[i];
1969
1970       if (bb->loop_father == loop->inner)
1971         factor = innerloop_iters;
1972       else
1973         factor = 1;
1974
1975       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1976         {
1977           gimple stmt = gsi_stmt (si);
1978           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1979           /* Skip stmts that are not vectorized inside the loop.  */
1980           if (!STMT_VINFO_RELEVANT_P (stmt_info)
1981               && (!STMT_VINFO_LIVE_P (stmt_info)
1982                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1983             continue;
1984           scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1985           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1986           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1987              some of the "outside" costs are generated inside the outer-loop.  */
1988           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1989         }
1990     }
1991
1992   /* Add additional cost for the peeled instructions in prologue and epilogue
1993      loop.
1994
1995      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1996      at compile-time - we assume it's vf/2 (the worst would be vf-1).
1997
1998      TODO: Build an expression that represents peel_iters for prologue and
1999      epilogue to be used in a run-time test.  */
2000
2001   if (byte_misalign < 0)
2002     {
2003       peel_iters_prologue = vf/2;
2004       if (vect_print_dump_info (REPORT_COST))
2005         fprintf (vect_dump, "cost model: "
2006                  "prologue peel iters set to vf/2.");
2007
2008       /* If peeling for alignment is unknown, loop bound of main loop becomes
2009          unknown.  */
2010       peel_iters_epilogue = vf/2;
2011       if (vect_print_dump_info (REPORT_COST))
2012         fprintf (vect_dump, "cost model: "
2013                  "epilogue peel iters set to vf/2 because "
2014                  "peeling for alignment is unknown .");
2015
2016       /* If peeled iterations are unknown, count a taken branch and a not taken
2017          branch per peeled loop. Even if scalar loop iterations are known,
2018          vector iterations are not known since peeled prologue iterations are
2019          not known. Hence guards remain the same.  */
2020       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
2021                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
2022     }
2023   else
2024     {
2025       if (byte_misalign)
2026         {
2027           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2028           int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2029           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2030           int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2031
2032           peel_iters_prologue = nelements - (byte_misalign / element_size);
2033         }
2034       else
2035         peel_iters_prologue = 0;
2036
2037       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2038         {
2039           peel_iters_epilogue = vf/2;
2040           if (vect_print_dump_info (REPORT_COST))
2041             fprintf (vect_dump, "cost model: "
2042                      "epilogue peel iters set to vf/2 because "
2043                      "loop iterations are unknown .");
2044
2045           /* If peeled iterations are known but number of scalar loop
2046              iterations are unknown, count a taken branch per peeled loop.  */
2047           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
2048
2049         }
2050       else
2051         {
2052           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2053           peel_iters_prologue = niters < peel_iters_prologue ?
2054                                         niters : peel_iters_prologue;
2055           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2056         }
2057     }
2058
2059   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2060                       + (peel_iters_epilogue * scalar_single_iter_cost)
2061                       + peel_guard_costs;
2062
2063   /* FORNOW: The scalar outside cost is incremented in one of the
2064      following ways:
2065
2066      1. The vectorizer checks for alignment and aliasing and generates
2067      a condition that allows dynamic vectorization.  A cost model
2068      check is ANDED with the versioning condition.  Hence scalar code
2069      path now has the added cost of the versioning check.
2070
2071        if (cost > th & versioning_check)
2072          jmp to vector code
2073
2074      Hence run-time scalar is incremented by not-taken branch cost.
2075
2076      2. The vectorizer then checks if a prologue is required.  If the
2077      cost model check was not done before during versioning, it has to
2078      be done before the prologue check.
2079
2080        if (cost <= th)
2081          prologue = scalar_iters
2082        if (prologue == 0)
2083          jmp to vector code
2084        else
2085          execute prologue
2086        if (prologue == num_iters)
2087          go to exit
2088
2089      Hence the run-time scalar cost is incremented by a taken branch,
2090      plus a not-taken branch, plus a taken branch cost.
2091
2092      3. The vectorizer then checks if an epilogue is required.  If the
2093      cost model check was not done before during prologue check, it
2094      has to be done with the epilogue check.
2095
2096        if (prologue == 0)
2097          jmp to vector code
2098        else
2099          execute prologue
2100        if (prologue == num_iters)
2101          go to exit
2102        vector code:
2103          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2104            jmp to epilogue
2105
2106      Hence the run-time scalar cost should be incremented by 2 taken
2107      branches.
2108
2109      TODO: The back end may reorder the BBS's differently and reverse
2110      conditions/branch directions.  Change the estimates below to
2111      something more reasonable.  */
2112
2113   /* If the number of iterations is known and we do not do versioning, we can
2114      decide whether to vectorize at compile time. Hence the scalar version
2115      do not carry cost model guard costs.  */
2116   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2117       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2118       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2119     {
2120       /* Cost model check occurs at versioning.  */
2121       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2122           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2123         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2124       else
2125         {
2126           /* Cost model check occurs at prologue generation.  */
2127           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2128             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2129               + TARG_COND_NOT_TAKEN_BRANCH_COST;
2130           /* Cost model check occurs at epilogue generation.  */
2131           else
2132             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2133         }
2134     }
2135
2136   /* Add SLP costs.  */
2137   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2138   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2139     {
2140       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2141       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2142     }
2143
2144   /* Calculate number of iterations required to make the vector version
2145      profitable, relative to the loop bodies only. The following condition
2146      must hold true:
2147      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2148      where
2149      SIC = scalar iteration cost, VIC = vector iteration cost,
2150      VOC = vector outside cost, VF = vectorization factor,
2151      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2152      SOC = scalar outside cost for run time cost model check.  */
2153
2154   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2155     {
2156       if (vec_outside_cost <= 0)
2157         min_profitable_iters = 1;
2158       else
2159         {
2160           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2161                                   - vec_inside_cost * peel_iters_prologue
2162                                   - vec_inside_cost * peel_iters_epilogue)
2163                                  / ((scalar_single_iter_cost * vf)
2164                                     - vec_inside_cost);
2165
2166           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2167               <= ((vec_inside_cost * min_profitable_iters)
2168                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2169             min_profitable_iters++;
2170         }
2171     }
2172   /* vector version will never be profitable.  */
2173   else
2174     {
2175       if (vect_print_dump_info (REPORT_COST))
2176         fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2177                  "divided by the scalar iteration cost = %d "
2178                  "is greater or equal to the vectorization factor = %d.",
2179                  vec_inside_cost, scalar_single_iter_cost, vf);
2180       return -1;
2181     }
2182
2183   if (vect_print_dump_info (REPORT_COST))
2184     {
2185       fprintf (vect_dump, "Cost model analysis: \n");
2186       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2187                vec_inside_cost);
2188       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2189                vec_outside_cost);
2190       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2191                scalar_single_iter_cost);
2192       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2193       fprintf (vect_dump, "  prologue iterations: %d\n",
2194                peel_iters_prologue);
2195       fprintf (vect_dump, "  epilogue iterations: %d\n",
2196                peel_iters_epilogue);
2197       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2198                min_profitable_iters);
2199     }
2200
2201   min_profitable_iters =
2202         min_profitable_iters < vf ? vf : min_profitable_iters;
2203
2204   /* Because the condition we create is:
2205      if (niters <= min_profitable_iters)
2206        then skip the vectorized loop.  */
2207   min_profitable_iters--;
2208
2209   if (vect_print_dump_info (REPORT_COST))
2210     fprintf (vect_dump, "  Profitability threshold = %d\n",
2211              min_profitable_iters);
2212
2213   return min_profitable_iters;
2214 }
2215
2216
2217 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2218    functions. Design better to avoid maintenance issues.  */
2219
2220 /* Function vect_model_reduction_cost.
2221
2222    Models cost for a reduction operation, including the vector ops
2223    generated within the strip-mine loop, the initial definition before
2224    the loop, and the epilogue code that must be generated.  */
2225
2226 static bool
2227 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2228                            int ncopies)
2229 {
2230   int outer_cost = 0;
2231   enum tree_code code;
2232   optab optab;
2233   tree vectype;
2234   gimple stmt, orig_stmt;
2235   tree reduction_op;
2236   enum machine_mode mode;
2237   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2238   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2239
2240
2241   /* Cost of reduction op inside loop.  */
2242   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2243
2244   stmt = STMT_VINFO_STMT (stmt_info);
2245
2246   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2247     {
2248     case GIMPLE_SINGLE_RHS:
2249       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2250       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2251       break;
2252     case GIMPLE_UNARY_RHS:
2253       reduction_op = gimple_assign_rhs1 (stmt);
2254       break;
2255     case GIMPLE_BINARY_RHS:
2256       reduction_op = gimple_assign_rhs2 (stmt);
2257       break;
2258     default:
2259       gcc_unreachable ();
2260     }
2261
2262   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2263   if (!vectype)
2264     {
2265       if (vect_print_dump_info (REPORT_COST))
2266         {
2267           fprintf (vect_dump, "unsupported data-type ");
2268           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2269         }
2270       return false;
2271    }
2272
2273   mode = TYPE_MODE (vectype);
2274   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2275
2276   if (!orig_stmt)
2277     orig_stmt = STMT_VINFO_STMT (stmt_info);
2278
2279   code = gimple_assign_rhs_code (orig_stmt);
2280
2281   /* Add in cost for initial definition.  */
2282   outer_cost += TARG_SCALAR_TO_VEC_COST;
2283
2284   /* Determine cost of epilogue code.
2285
2286      We have a reduction operator that will reduce the vector in one statement.
2287      Also requires scalar extract.  */
2288
2289   if (!nested_in_vect_loop_p (loop, orig_stmt))
2290     {
2291       if (reduc_code != ERROR_MARK)
2292         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2293       else
2294         {
2295           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2296           tree bitsize =
2297             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2298           int element_bitsize = tree_low_cst (bitsize, 1);
2299           int nelements = vec_size_in_bits / element_bitsize;
2300
2301           optab = optab_for_tree_code (code, vectype, optab_default);
2302
2303           /* We have a whole vector shift available.  */
2304           if (VECTOR_MODE_P (mode)
2305               && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2306               && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2307             /* Final reduction via vector shifts and the reduction operator. Also
2308                requires scalar extract.  */
2309             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2310                                 + TARG_VEC_TO_SCALAR_COST);
2311           else
2312             /* Use extracts and reduction op for final reduction.  For N elements,
2313                we have N extracts and N-1 reduction ops.  */
2314             outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2315         }
2316     }
2317
2318   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2319
2320   if (vect_print_dump_info (REPORT_COST))
2321     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2322              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2323              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2324
2325   return true;
2326 }
2327
2328
2329 /* Function vect_model_induction_cost.
2330
2331    Models cost for induction operations.  */
2332
2333 static void
2334 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2335 {
2336   /* loop cost for vec_loop.  */
2337   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2338   /* prologue cost for vec_init and vec_step.  */
2339   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2340
2341   if (vect_print_dump_info (REPORT_COST))
2342     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2343              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2344              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2345 }
2346
2347
2348 /* Function get_initial_def_for_induction
2349
2350    Input:
2351    STMT - a stmt that performs an induction operation in the loop.
2352    IV_PHI - the initial value of the induction variable
2353
2354    Output:
2355    Return a vector variable, initialized with the first VF values of
2356    the induction variable. E.g., for an iv with IV_PHI='X' and
2357    evolution S, for a vector of 4 units, we want to return:
2358    [X, X + S, X + 2*S, X + 3*S].  */
2359
2360 static tree
2361 get_initial_def_for_induction (gimple iv_phi)
2362 {
2363   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2364   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2365   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2366   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2367   tree vectype;
2368   int nunits;
2369   edge pe = loop_preheader_edge (loop);
2370   struct loop *iv_loop;
2371   basic_block new_bb;
2372   tree vec, vec_init, vec_step, t;
2373   tree access_fn;
2374   tree new_var;
2375   tree new_name;
2376   gimple init_stmt, induction_phi, new_stmt;
2377   tree induc_def, vec_def, vec_dest;
2378   tree init_expr, step_expr;
2379   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2380   int i;
2381   bool ok;
2382   int ncopies;
2383   tree expr;
2384   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2385   bool nested_in_vect_loop = false;
2386   gimple_seq stmts = NULL;
2387   imm_use_iterator imm_iter;
2388   use_operand_p use_p;
2389   gimple exit_phi;
2390   edge latch_e;
2391   tree loop_arg;
2392   gimple_stmt_iterator si;
2393   basic_block bb = gimple_bb (iv_phi);
2394   tree stepvectype;
2395
2396   vectype = get_vectype_for_scalar_type (scalar_type);
2397   gcc_assert (vectype);
2398   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2399   ncopies = vf / nunits;
2400
2401   gcc_assert (phi_info);
2402   gcc_assert (ncopies >= 1);
2403
2404   /* Find the first insertion point in the BB.  */
2405   si = gsi_after_labels (bb);
2406
2407   if (INTEGRAL_TYPE_P (scalar_type))
2408     step_expr = build_int_cst (scalar_type, 0);
2409   else if (POINTER_TYPE_P (scalar_type))
2410     step_expr = build_int_cst (sizetype, 0);
2411   else
2412     step_expr = build_real (scalar_type, dconst0);
2413
2414   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2415   if (nested_in_vect_loop_p (loop, iv_phi))
2416     {
2417       nested_in_vect_loop = true;
2418       iv_loop = loop->inner;
2419     }
2420   else
2421     iv_loop = loop;
2422   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2423
2424   latch_e = loop_latch_edge (iv_loop);
2425   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2426
2427   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2428   gcc_assert (access_fn);
2429   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2430                                     &init_expr, &step_expr);
2431   gcc_assert (ok);
2432   pe = loop_preheader_edge (iv_loop);
2433
2434   /* Create the vector that holds the initial_value of the induction.  */
2435   if (nested_in_vect_loop)
2436     {
2437       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2438          been created during vectorization of previous stmts; We obtain it from
2439          the STMT_VINFO_VEC_STMT of the defining stmt. */
2440       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2441                                            loop_preheader_edge (iv_loop));
2442       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2443     }
2444   else
2445     {
2446       /* iv_loop is the loop to be vectorized. Create:
2447          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2448       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2449       add_referenced_var (new_var);
2450
2451       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2452       if (stmts)
2453         {
2454           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2455           gcc_assert (!new_bb);
2456         }
2457
2458       t = NULL_TREE;
2459       t = tree_cons (NULL_TREE, init_expr, t);
2460       for (i = 1; i < nunits; i++)
2461         {
2462           /* Create: new_name_i = new_name + step_expr  */
2463           enum tree_code code = POINTER_TYPE_P (scalar_type)
2464                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2465           init_stmt = gimple_build_assign_with_ops (code, new_var,
2466                                                     new_name, step_expr);
2467           new_name = make_ssa_name (new_var, init_stmt);
2468           gimple_assign_set_lhs (init_stmt, new_name);
2469
2470           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2471           gcc_assert (!new_bb);
2472
2473           if (vect_print_dump_info (REPORT_DETAILS))
2474             {
2475               fprintf (vect_dump, "created new init_stmt: ");
2476               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2477             }
2478           t = tree_cons (NULL_TREE, new_name, t);
2479         }
2480       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2481       vec = build_constructor_from_list (vectype, nreverse (t));
2482       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2483     }
2484
2485
2486   /* Create the vector that holds the step of the induction.  */
2487   if (nested_in_vect_loop)
2488     /* iv_loop is nested in the loop to be vectorized. Generate:
2489        vec_step = [S, S, S, S]  */
2490     new_name = step_expr;
2491   else
2492     {
2493       /* iv_loop is the loop to be vectorized. Generate:
2494           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2495       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2496       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2497                               expr, step_expr);
2498     }
2499
2500   t = NULL_TREE;
2501   for (i = 0; i < nunits; i++)
2502     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2503   gcc_assert (CONSTANT_CLASS_P (new_name));
2504   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2505   gcc_assert (stepvectype);
2506   vec = build_vector (stepvectype, t);
2507   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2508
2509
2510   /* Create the following def-use cycle:
2511      loop prolog:
2512          vec_init = ...
2513          vec_step = ...
2514      loop:
2515          vec_iv = PHI <vec_init, vec_loop>
2516          ...
2517          STMT
2518          ...
2519          vec_loop = vec_iv + vec_step;  */
2520
2521   /* Create the induction-phi that defines the induction-operand.  */
2522   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2523   add_referenced_var (vec_dest);
2524   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2525   set_vinfo_for_stmt (induction_phi,
2526                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2527   induc_def = PHI_RESULT (induction_phi);
2528
2529   /* Create the iv update inside the loop  */
2530   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2531                                            induc_def, vec_step);
2532   vec_def = make_ssa_name (vec_dest, new_stmt);
2533   gimple_assign_set_lhs (new_stmt, vec_def);
2534   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2535   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2536                                                    NULL));
2537
2538   /* Set the arguments of the phi node:  */
2539   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2540   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2541                UNKNOWN_LOCATION);
2542
2543
2544   /* In case that vectorization factor (VF) is bigger than the number
2545      of elements that we can fit in a vectype (nunits), we have to generate
2546      more than one vector stmt - i.e - we need to "unroll" the
2547      vector stmt by a factor VF/nunits.  For more details see documentation
2548      in vectorizable_operation.  */
2549
2550   if (ncopies > 1)
2551     {
2552       stmt_vec_info prev_stmt_vinfo;
2553       /* FORNOW. This restriction should be relaxed.  */
2554       gcc_assert (!nested_in_vect_loop);
2555
2556       /* Create the vector that holds the step of the induction.  */
2557       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2558       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2559                               expr, step_expr);
2560       t = NULL_TREE;
2561       for (i = 0; i < nunits; i++)
2562         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2563       gcc_assert (CONSTANT_CLASS_P (new_name));
2564       vec = build_vector (stepvectype, t);
2565       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2566
2567       vec_def = induc_def;
2568       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2569       for (i = 1; i < ncopies; i++)
2570         {
2571           /* vec_i = vec_prev + vec_step  */
2572           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2573                                                    vec_def, vec_step);
2574           vec_def = make_ssa_name (vec_dest, new_stmt);
2575           gimple_assign_set_lhs (new_stmt, vec_def);
2576
2577           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2578           set_vinfo_for_stmt (new_stmt,
2579                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2580           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2581           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2582         }
2583     }
2584
2585   if (nested_in_vect_loop)
2586     {
2587       /* Find the loop-closed exit-phi of the induction, and record
2588          the final vector of induction results:  */
2589       exit_phi = NULL;
2590       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2591         {
2592           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2593             {
2594               exit_phi = USE_STMT (use_p);
2595               break;
2596             }
2597         }
2598       if (exit_phi)
2599         {
2600           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2601           /* FORNOW. Currently not supporting the case that an inner-loop induction
2602              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2603           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2604                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2605
2606           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2607           if (vect_print_dump_info (REPORT_DETAILS))
2608             {
2609               fprintf (vect_dump, "vector of inductions after inner-loop:");
2610               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2611             }
2612         }
2613     }
2614
2615
2616   if (vect_print_dump_info (REPORT_DETAILS))
2617     {
2618       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2619       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2620       fprintf (vect_dump, "\n");
2621       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2622     }
2623
2624   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2625   return induc_def;
2626 }
2627
2628
2629 /* Function get_initial_def_for_reduction
2630
2631    Input:
2632    STMT - a stmt that performs a reduction operation in the loop.
2633    INIT_VAL - the initial value of the reduction variable
2634
2635    Output:
2636    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2637         of the reduction (used for adjusting the epilog - see below).
2638    Return a vector variable, initialized according to the operation that STMT
2639         performs. This vector will be used as the initial value of the
2640         vector of partial results.
2641
2642    Option1 (adjust in epilog): Initialize the vector as follows:
2643      add/bit or/xor:    [0,0,...,0,0]
2644      mult/bit and:      [1,1,...,1,1]
2645      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2646    and when necessary (e.g. add/mult case) let the caller know
2647    that it needs to adjust the result by init_val.
2648
2649    Option2: Initialize the vector as follows:
2650      add/bit or/xor:    [init_val,0,0,...,0]
2651      mult/bit and:      [init_val,1,1,...,1]
2652      min/max/cond_expr: [init_val,init_val,...,init_val]
2653    and no adjustments are needed.
2654
2655    For example, for the following code:
2656
2657    s = init_val;
2658    for (i=0;i<n;i++)
2659      s = s + a[i];
2660
2661    STMT is 's = s + a[i]', and the reduction variable is 's'.
2662    For a vector of 4 units, we want to return either [0,0,0,init_val],
2663    or [0,0,0,0] and let the caller know that it needs to adjust
2664    the result at the end by 'init_val'.
2665
2666    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2667    initialization vector is simpler (same element in all entries), if
2668    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2669
2670    A cost model should help decide between these two schemes.  */
2671
2672 tree
2673 get_initial_def_for_reduction (gimple stmt, tree init_val,
2674                                tree *adjustment_def)
2675 {
2676   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2677   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2678   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2679   tree scalar_type = TREE_TYPE (init_val);
2680   tree vectype = get_vectype_for_scalar_type (scalar_type);
2681   int nunits;
2682   enum tree_code code = gimple_assign_rhs_code (stmt);
2683   tree def_for_init;
2684   tree init_def;
2685   tree t = NULL_TREE;
2686   int i;
2687   bool nested_in_vect_loop = false;
2688   tree init_value;
2689   REAL_VALUE_TYPE real_init_val = dconst0;
2690   int int_init_val = 0;
2691   gimple def_stmt = NULL;
2692
2693   gcc_assert (vectype);
2694   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2695
2696   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2697               || SCALAR_FLOAT_TYPE_P (scalar_type));
2698
2699   if (nested_in_vect_loop_p (loop, stmt))
2700     nested_in_vect_loop = true;
2701   else
2702     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2703
2704   /* In case of double reduction we only create a vector variable to be put
2705      in the reduction phi node. The actual statement creation is done in
2706      vect_create_epilog_for_reduction.  */
2707   if (adjustment_def && nested_in_vect_loop
2708       && TREE_CODE (init_val) == SSA_NAME
2709       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2710       && gimple_code (def_stmt) == GIMPLE_PHI
2711       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2712       && vinfo_for_stmt (def_stmt)
2713       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2714           == vect_double_reduction_def)
2715     {
2716       *adjustment_def = NULL;
2717       return vect_create_destination_var (init_val, vectype);
2718     }
2719
2720   if (TREE_CONSTANT (init_val))
2721     {
2722       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2723         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2724       else
2725         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2726     }
2727   else
2728     init_value = init_val;
2729
2730   switch (code)
2731     {
2732       case WIDEN_SUM_EXPR:
2733       case DOT_PROD_EXPR:
2734       case PLUS_EXPR:
2735       case MINUS_EXPR:
2736       case BIT_IOR_EXPR:
2737       case BIT_XOR_EXPR:
2738       case MULT_EXPR:
2739       case BIT_AND_EXPR:
2740         /* ADJUSMENT_DEF is NULL when called from
2741            vect_create_epilog_for_reduction to vectorize double reduction.  */
2742         if (adjustment_def)
2743           {
2744             if (nested_in_vect_loop)
2745               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2746                                                               NULL);
2747             else
2748               *adjustment_def = init_val;
2749           }
2750
2751         if (code == MULT_EXPR)
2752           {
2753             real_init_val = dconst1;
2754             int_init_val = 1;
2755           }
2756
2757         if (code == BIT_AND_EXPR)
2758           int_init_val = -1;
2759
2760         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2761           def_for_init = build_real (scalar_type, real_init_val);
2762         else
2763           def_for_init = build_int_cst (scalar_type, int_init_val);
2764
2765         /* Create a vector of '0' or '1' except the first element.  */
2766         for (i = nunits - 2; i >= 0; --i)
2767           t = tree_cons (NULL_TREE, def_for_init, t);
2768
2769         /* Option1: the first element is '0' or '1' as well.  */
2770         if (adjustment_def)
2771           {
2772             t = tree_cons (NULL_TREE, def_for_init, t);
2773             init_def = build_vector (vectype, t);
2774             break;
2775           }
2776
2777         /* Option2: the first element is INIT_VAL.  */
2778         t = tree_cons (NULL_TREE, init_value, t);
2779         if (TREE_CONSTANT (init_val))
2780           init_def = build_vector (vectype, t);
2781         else
2782           init_def = build_constructor_from_list (vectype, t);
2783
2784         break;
2785
2786       case MIN_EXPR:
2787       case MAX_EXPR:
2788       case COND_EXPR:
2789         if (adjustment_def)
2790           {
2791             *adjustment_def = NULL_TREE;
2792             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2793             break;
2794           }
2795
2796         for (i = nunits - 1; i >= 0; --i)
2797           t = tree_cons (NULL_TREE, init_value, t);
2798
2799         if (TREE_CONSTANT (init_val))
2800           init_def = build_vector (vectype, t);
2801         else
2802           init_def = build_constructor_from_list (vectype, t);
2803
2804         break;
2805
2806       default:
2807         gcc_unreachable ();
2808     }
2809
2810   return init_def;
2811 }
2812
2813
2814 /* Function vect_create_epilog_for_reduction
2815
2816    Create code at the loop-epilog to finalize the result of a reduction
2817    computation.
2818
2819    VECT_DEF is a vector of partial results.
2820    REDUC_CODE is the tree-code for the epilog reduction.
2821    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2822      number of elements that we can fit in a vectype (nunits). In this case
2823      we have to generate more than one vector stmt - i.e - we need to "unroll"
2824      the vector stmt by a factor VF/nunits.  For more details see documentation
2825      in vectorizable_operation.
2826    STMT is the scalar reduction stmt that is being vectorized.
2827    REDUCTION_PHI is the phi-node that carries the reduction computation.
2828    REDUC_INDEX is the index of the operand in the right hand side of the
2829      statement that is defined by REDUCTION_PHI.
2830    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2831
2832    This function:
2833    1. Creates the reduction def-use cycle: sets the arguments for
2834       REDUCTION_PHI:
2835       The loop-entry argument is the vectorized initial-value of the reduction.
2836       The loop-latch argument is VECT_DEF - the vector of partial sums.
2837    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2838       by applying the operation specified by REDUC_CODE if available, or by
2839       other means (whole-vector shifts or a scalar loop).
2840       The function also creates a new phi node at the loop exit to preserve
2841       loop-closed form, as illustrated below.
2842
2843      The flow at the entry to this function:
2844
2845         loop:
2846           vec_def = phi <null, null>            # REDUCTION_PHI
2847           VECT_DEF = vector_stmt                # vectorized form of STMT
2848           s_loop = scalar_stmt                  # (scalar) STMT
2849         loop_exit:
2850           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2851           use <s_out0>
2852           use <s_out0>
2853
2854      The above is transformed by this function into:
2855
2856         loop:
2857           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2858           VECT_DEF = vector_stmt                # vectorized form of STMT
2859           s_loop = scalar_stmt                  # (scalar) STMT
2860         loop_exit:
2861           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2862           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2863           v_out2 = reduce <v_out1>
2864           s_out3 = extract_field <v_out2, 0>
2865           s_out4 = adjust_result <s_out3>
2866           use <s_out4>
2867           use <s_out4>
2868 */
2869
2870 static void
2871 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2872                                   int ncopies,
2873                                   enum tree_code reduc_code,
2874                                   gimple reduction_phi,
2875                                   int reduc_index,
2876                                   bool double_reduc)
2877 {
2878   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2879   stmt_vec_info prev_phi_info;
2880   tree vectype;
2881   enum machine_mode mode;
2882   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2883   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2884   basic_block exit_bb;
2885   tree scalar_dest;
2886   tree scalar_type;
2887   gimple new_phi = NULL, phi;
2888   gimple_stmt_iterator exit_gsi;
2889   tree vec_dest;
2890   tree new_temp = NULL_TREE;
2891   tree new_name;
2892   gimple epilog_stmt = NULL;
2893   tree new_scalar_dest, new_dest;
2894   gimple exit_phi;
2895   tree bitsize, bitpos;
2896   enum tree_code code = gimple_assign_rhs_code (stmt);
2897   tree adjustment_def;
2898   tree vec_initial_def, def;
2899   tree orig_name;
2900   imm_use_iterator imm_iter;
2901   use_operand_p use_p;
2902   bool extract_scalar_result = false;
2903   tree reduction_op, expr;
2904   gimple orig_stmt;
2905   gimple use_stmt;
2906   bool nested_in_vect_loop = false;
2907   VEC(gimple,heap) *phis = NULL;
2908   enum vect_def_type dt = vect_unknown_def_type;
2909   int j, i;
2910
2911   if (nested_in_vect_loop_p (loop, stmt))
2912     {
2913       outer_loop = loop;
2914       loop = loop->inner;
2915       nested_in_vect_loop = true;
2916     }
2917
2918   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2919     {
2920     case GIMPLE_SINGLE_RHS:
2921       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2922                                        == ternary_op);
2923       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2924       break;
2925     case GIMPLE_UNARY_RHS:
2926       reduction_op = gimple_assign_rhs1 (stmt);
2927       break;
2928     case GIMPLE_BINARY_RHS:
2929       reduction_op = reduc_index ?
2930                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2931       break;
2932     default:
2933       gcc_unreachable ();
2934     }
2935
2936   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2937   gcc_assert (vectype);
2938   mode = TYPE_MODE (vectype);
2939
2940   /*** 1. Create the reduction def-use cycle  ***/
2941
2942   /* For the case of reduction, vect_get_vec_def_for_operand returns
2943      the scalar def before the loop, that defines the initial value
2944      of the reduction variable.  */
2945   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2946                                                   &adjustment_def);
2947
2948   phi = reduction_phi;
2949   def = vect_def;
2950   for (j = 0; j < ncopies; j++)
2951     {
2952       /* 1.1 set the loop-entry arg of the reduction-phi:  */
2953       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2954                    UNKNOWN_LOCATION);
2955
2956       /* 1.2 set the loop-latch arg for the reduction-phi:  */
2957       if (j > 0)
2958         def = vect_get_vec_def_for_stmt_copy (dt, def);
2959       add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2960
2961       if (vect_print_dump_info (REPORT_DETAILS))
2962         {
2963           fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2964           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2965           fprintf (vect_dump, "\n");
2966           print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2967         }
2968
2969       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2970     }
2971
2972   /*** 2. Create epilog code
2973           The reduction epilog code operates across the elements of the vector
2974           of partial results computed by the vectorized loop.
2975           The reduction epilog code consists of:
2976           step 1: compute the scalar result in a vector (v_out2)
2977           step 2: extract the scalar result (s_out3) from the vector (v_out2)
2978           step 3: adjust the scalar result (s_out3) if needed.
2979
2980           Step 1 can be accomplished using one the following three schemes:
2981           (scheme 1) using reduc_code, if available.
2982           (scheme 2) using whole-vector shifts, if available.
2983           (scheme 3) using a scalar loop. In this case steps 1+2 above are
2984                      combined.
2985
2986           The overall epilog code looks like this:
2987
2988           s_out0 = phi <s_loop>         # original EXIT_PHI
2989           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
2990           v_out2 = reduce <v_out1>              # step 1
2991           s_out3 = extract_field <v_out2, 0>    # step 2
2992           s_out4 = adjust_result <s_out3>       # step 3
2993
2994           (step 3 is optional, and steps 1 and 2 may be combined).
2995           Lastly, the uses of s_out0 are replaced by s_out4.
2996
2997           ***/
2998
2999   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
3000         v_out1 = phi <v_loop>  */
3001
3002   exit_bb = single_exit (loop)->dest;
3003   def = vect_def;
3004   prev_phi_info = NULL;
3005   for (j = 0; j < ncopies; j++)
3006     {
3007       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
3008       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3009       if (j == 0)
3010         new_phi = phi;
3011       else
3012         {
3013           def = vect_get_vec_def_for_stmt_copy (dt, def);
3014           STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3015         }
3016       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3017       prev_phi_info = vinfo_for_stmt (phi);
3018     }
3019
3020   exit_gsi = gsi_after_labels (exit_bb);
3021
3022   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3023          (i.e. when reduc_code is not available) and in the final adjustment
3024          code (if needed).  Also get the original scalar reduction variable as
3025          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3026          represents a reduction pattern), the tree-code and scalar-def are
3027          taken from the original stmt that the pattern-stmt (STMT) replaces.
3028          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3029          are taken from STMT.  */
3030
3031   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3032   if (!orig_stmt)
3033     {
3034       /* Regular reduction  */
3035       orig_stmt = stmt;
3036     }
3037   else
3038     {
3039       /* Reduction pattern  */
3040       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3041       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3042       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3043     }
3044
3045   code = gimple_assign_rhs_code (orig_stmt);
3046   scalar_dest = gimple_assign_lhs (orig_stmt);
3047   scalar_type = TREE_TYPE (scalar_dest);
3048   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3049   bitsize = TYPE_SIZE (scalar_type);
3050
3051   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3052      partial results are added and not subtracted.  */
3053   if (code == MINUS_EXPR)
3054     code = PLUS_EXPR;
3055
3056   /* In case this is a reduction in an inner-loop while vectorizing an outer
3057      loop - we don't need to extract a single scalar result at the end of the
3058      inner-loop (unless it is double reduction, i.e., the use of reduction is
3059      outside the outer-loop). The final vector of partial results will be used
3060      in the vectorized outer-loop, or reduced to a scalar result at the end of
3061      the outer-loop.  */
3062   if (nested_in_vect_loop && !double_reduc)
3063     goto vect_finalize_reduction;
3064
3065   /* The epilogue is created for the outer-loop, i.e., for the loop being
3066      vectorized.  */
3067   if (double_reduc)
3068     loop = outer_loop;
3069
3070   /* FORNOW */
3071   gcc_assert (ncopies == 1);
3072
3073   /* 2.3 Create the reduction code, using one of the three schemes described
3074          above.  */
3075
3076   if (reduc_code != ERROR_MARK)
3077     {
3078       tree tmp;
3079
3080       /*** Case 1:  Create:
3081            v_out2 = reduc_expr <v_out1>  */
3082
3083       if (vect_print_dump_info (REPORT_DETAILS))
3084         fprintf (vect_dump, "Reduce using direct vector reduction.");
3085
3086       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3087       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3088       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3089       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3090       gimple_assign_set_lhs (epilog_stmt, new_temp);
3091       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3092
3093       extract_scalar_result = true;
3094     }
3095   else
3096     {
3097       enum tree_code shift_code = ERROR_MARK;
3098       bool have_whole_vector_shift = true;
3099       int bit_offset;
3100       int element_bitsize = tree_low_cst (bitsize, 1);
3101       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3102       tree vec_temp;
3103
3104       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3105         shift_code = VEC_RSHIFT_EXPR;
3106       else
3107         have_whole_vector_shift = false;
3108
3109       /* Regardless of whether we have a whole vector shift, if we're
3110          emulating the operation via tree-vect-generic, we don't want
3111          to use it.  Only the first round of the reduction is likely
3112          to still be profitable via emulation.  */
3113       /* ??? It might be better to emit a reduction tree code here, so that
3114          tree-vect-generic can expand the first round via bit tricks.  */
3115       if (!VECTOR_MODE_P (mode))
3116         have_whole_vector_shift = false;
3117       else
3118         {
3119           optab optab = optab_for_tree_code (code, vectype, optab_default);
3120           if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3121             have_whole_vector_shift = false;
3122         }
3123
3124       if (have_whole_vector_shift)
3125         {
3126           /*** Case 2: Create:
3127              for (offset = VS/2; offset >= element_size; offset/=2)
3128                 {
3129                   Create:  va' = vec_shift <va, offset>
3130                   Create:  va = vop <va, va'>
3131                 }  */
3132
3133           if (vect_print_dump_info (REPORT_DETAILS))
3134             fprintf (vect_dump, "Reduce using vector shifts");
3135
3136           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3137           new_temp = PHI_RESULT (new_phi);
3138
3139           for (bit_offset = vec_size_in_bits/2;
3140                bit_offset >= element_bitsize;
3141                bit_offset /= 2)
3142             {
3143               tree bitpos = size_int (bit_offset);
3144
3145               epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3146                                                           new_temp, bitpos);
3147               new_name = make_ssa_name (vec_dest, epilog_stmt);
3148               gimple_assign_set_lhs (epilog_stmt, new_name);
3149               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3150
3151               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3152                                                           new_name, new_temp);
3153               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3154               gimple_assign_set_lhs (epilog_stmt, new_temp);
3155               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3156             }
3157
3158           extract_scalar_result = true;
3159         }
3160       else
3161         {
3162           tree rhs;
3163
3164           /*** Case 3: Create:
3165              s = extract_field <v_out2, 0>
3166              for (offset = element_size;
3167                   offset < vector_size;
3168                   offset += element_size;)
3169                {
3170                  Create:  s' = extract_field <v_out2, offset>
3171                  Create:  s = op <s, s'>
3172                }  */
3173
3174           if (vect_print_dump_info (REPORT_DETAILS))
3175             fprintf (vect_dump, "Reduce using scalar code. ");
3176
3177           vec_temp = PHI_RESULT (new_phi);
3178           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3179           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3180                          bitsize_zero_node);
3181           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3182           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3183           gimple_assign_set_lhs (epilog_stmt, new_temp);
3184           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3185
3186           for (bit_offset = element_bitsize;
3187                bit_offset < vec_size_in_bits;
3188                bit_offset += element_bitsize)
3189             {
3190               tree bitpos = bitsize_int (bit_offset);
3191               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3192                                  bitpos);
3193
3194               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3195               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3196               gimple_assign_set_lhs (epilog_stmt, new_name);
3197               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3198
3199               epilog_stmt = gimple_build_assign_with_ops (code,
3200                                                           new_scalar_dest,
3201                                                           new_name, new_temp);
3202               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3203               gimple_assign_set_lhs (epilog_stmt, new_temp);
3204               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3205             }
3206
3207           extract_scalar_result = false;
3208         }
3209     }
3210
3211   /* 2.4  Extract the final scalar result.  Create:
3212          s_out3 = extract_field <v_out2, bitpos>  */
3213
3214   if (extract_scalar_result)
3215     {
3216       tree rhs;
3217
3218       gcc_assert (!nested_in_vect_loop || double_reduc);
3219       if (vect_print_dump_info (REPORT_DETAILS))
3220         fprintf (vect_dump, "extract scalar result");
3221
3222       if (BYTES_BIG_ENDIAN)
3223         bitpos = size_binop (MULT_EXPR,
3224                        bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3225                        TYPE_SIZE (scalar_type));
3226       else
3227         bitpos = bitsize_zero_node;
3228
3229       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3230       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3231       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3232       gimple_assign_set_lhs (epilog_stmt, new_temp);
3233       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3234     }
3235
3236 vect_finalize_reduction:
3237
3238   if (double_reduc)
3239     loop = loop->inner;
3240
3241   /* 2.5 Adjust the final result by the initial value of the reduction
3242          variable. (When such adjustment is not needed, then
3243          'adjustment_def' is zero).  For example, if code is PLUS we create:
3244          new_temp = loop_exit_def + adjustment_def  */
3245
3246   if (adjustment_def)
3247     {
3248       if (nested_in_vect_loop)
3249         {
3250           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3251           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3252           new_dest = vect_create_destination_var (scalar_dest, vectype);
3253         }
3254       else
3255         {
3256           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3257           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3258           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3259         }
3260
3261       epilog_stmt = gimple_build_assign (new_dest, expr);
3262       new_temp = make_ssa_name (new_dest, epilog_stmt);
3263       gimple_assign_set_lhs (epilog_stmt, new_temp);
3264       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3265       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3266     }
3267
3268
3269   /* 2.6  Handle the loop-exit phi  */
3270
3271   /* Replace uses of s_out0 with uses of s_out3:
3272      Find the loop-closed-use at the loop exit of the original scalar result.
3273      (The reduction result is expected to have two immediate uses - one at the
3274      latch block, and one at the loop exit).  */
3275   phis = VEC_alloc (gimple, heap, 10);
3276   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3277     {
3278       if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3279         {
3280           exit_phi = USE_STMT (use_p);
3281           VEC_quick_push (gimple, phis, exit_phi);
3282         }
3283     }
3284
3285   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
3286   gcc_assert (!VEC_empty (gimple, phis));
3287
3288   for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3289     {
3290       if (nested_in_vect_loop)
3291         {
3292           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3293           gimple vect_phi;
3294
3295           /* FORNOW. Currently not supporting the case that an inner-loop
3296              reduction is not used in the outer-loop (but only outside the
3297              outer-loop), unless it is double reduction.  */
3298           gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3299                       && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3300
3301           epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3302           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3303           set_vinfo_for_stmt (epilog_stmt,
3304                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3305                                                  NULL));
3306           if (adjustment_def)
3307             STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3308                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3309
3310           if (!double_reduc
3311               || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3312             continue;
3313
3314           /* Handle double reduction:
3315
3316              stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3317              stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3318              stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
3319              stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3320
3321              At that point the regular reduction (stmt2 and stmt3) is already
3322              vectorized, as well as the exit phi node, stmt4.
3323              Here we vectorize the phi node of double reduction, stmt1, and
3324              update all relevant statements.  */
3325
3326           /* Go through all the uses of s2 to find double reduction phi node,
3327              i.e., stmt1 above.  */
3328           orig_name = PHI_RESULT (exit_phi);
3329           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3330             {
3331               stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3332               stmt_vec_info new_phi_vinfo;
3333               tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3334               basic_block bb = gimple_bb (use_stmt);
3335               gimple use;
3336
3337               /* Check that USE_STMT is really double reduction phi node.  */
3338               if (gimple_code (use_stmt) != GIMPLE_PHI
3339                   || gimple_phi_num_args (use_stmt) != 2
3340                   || !use_stmt_vinfo
3341                   || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3342                       != vect_double_reduction_def
3343                   || bb->loop_father != outer_loop)
3344                 continue;
3345
3346               /* Create vector phi node for double reduction:
3347                  vs1 = phi <vs0, vs2>
3348                  vs1 was created previously in this function by a call to
3349                  vect_get_vec_def_for_operand and is stored in vec_initial_def;
3350                  vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3351                  vs0 is created here.  */
3352
3353               /* Create vector phi node.  */
3354               vect_phi = create_phi_node (vec_initial_def, bb);
3355               new_phi_vinfo = new_stmt_vec_info (vect_phi,
3356                                     loop_vec_info_for_loop (outer_loop), NULL);
3357               set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3358
3359               /* Create vs0 - initial def of the double reduction phi.  */
3360               preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3361                                              loop_preheader_edge (outer_loop));
3362               init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3363                                                         NULL);
3364               vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3365                                                 NULL);
3366
3367               /* Update phi node arguments with vs0 and vs2.  */
3368               add_phi_arg (vect_phi, vect_phi_init,
3369                            loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
3370               add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3371                            loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3372               if (vect_print_dump_info (REPORT_DETAILS))
3373                 {
3374                   fprintf (vect_dump, "created double reduction phi node: ");
3375                   print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3376                 }
3377
3378               vect_phi_res = PHI_RESULT (vect_phi);
3379
3380               /* Replace the use, i.e., set the correct vs1 in the regular
3381                  reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3382                  is redundant.  */
3383               use = reduction_phi;
3384               for (j = 0; j < ncopies; j++)
3385                 {
3386                   edge pr_edge = loop_preheader_edge (loop);
3387                   SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3388                   use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3389                 }
3390             }
3391         }
3392
3393       /* Replace the uses:  */
3394       orig_name = PHI_RESULT (exit_phi);
3395       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3396         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3397           SET_USE (use_p, new_temp);
3398     }
3399
3400   VEC_free (gimple, heap, phis);
3401 }
3402
3403
3404 /* Function vectorizable_reduction.
3405
3406    Check if STMT performs a reduction operation that can be vectorized.
3407    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3408    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3409    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3410
3411    This function also handles reduction idioms (patterns) that have been
3412    recognized in advance during vect_pattern_recog. In this case, STMT may be
3413    of this form:
3414      X = pattern_expr (arg0, arg1, ..., X)
3415    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3416    sequence that had been detected and replaced by the pattern-stmt (STMT).
3417
3418    In some cases of reduction patterns, the type of the reduction variable X is
3419    different than the type of the other arguments of STMT.
3420    In such cases, the vectype that is used when transforming STMT into a vector
3421    stmt is different than the vectype that is used to determine the
3422    vectorization factor, because it consists of a different number of elements
3423    than the actual number of elements that are being operated upon in parallel.
3424
3425    For example, consider an accumulation of shorts into an int accumulator.
3426    On some targets it's possible to vectorize this pattern operating on 8
3427    shorts at a time (hence, the vectype for purposes of determining the
3428    vectorization factor should be V8HI); on the other hand, the vectype that
3429    is used to create the vector form is actually V4SI (the type of the result).
3430
3431    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3432    indicates what is the actual level of parallelism (V8HI in the example), so
3433    that the right vectorization factor would be derived. This vectype
3434    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3435    be used to create the vectorized stmt. The right vectype for the vectorized
3436    stmt is obtained from the type of the result X:
3437         get_vectype_for_scalar_type (TREE_TYPE (X))
3438
3439    This means that, contrary to "regular" reductions (or "regular" stmts in
3440    general), the following equation:
3441       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3442    does *NOT* necessarily hold for reduction patterns.  */
3443
3444 bool
3445 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3446                         gimple *vec_stmt)
3447 {
3448   tree vec_dest;
3449   tree scalar_dest;
3450   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3451   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3452   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3453   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3454   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3455   enum tree_code code, orig_code, epilog_reduc_code;
3456   enum machine_mode vec_mode;
3457   int op_type;
3458   optab optab, reduc_optab;
3459   tree new_temp = NULL_TREE;
3460   tree def;
3461   gimple def_stmt;
3462   enum vect_def_type dt;
3463   gimple new_phi = NULL;
3464   tree scalar_type;
3465   bool is_simple_use;
3466   gimple orig_stmt;
3467   stmt_vec_info orig_stmt_info;
3468   tree expr = NULL_TREE;
3469   int i;
3470   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3471   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3472   int epilog_copies;
3473   stmt_vec_info prev_stmt_info, prev_phi_info;
3474   gimple first_phi = NULL;
3475   bool single_defuse_cycle = false;
3476   tree reduc_def = NULL_TREE;
3477   gimple new_stmt = NULL;
3478   int j;
3479   tree ops[3];
3480   bool nested_cycle = false, found_nested_cycle_def = false;
3481   gimple reduc_def_stmt = NULL;
3482   /* The default is that the reduction variable is the last in statement.  */
3483   int reduc_index = 2;
3484   bool double_reduc = false, dummy;
3485   basic_block def_bb;
3486   struct loop * def_stmt_loop, *outer_loop = NULL;
3487   tree def_arg;
3488   gimple def_arg_stmt;
3489
3490   if (nested_in_vect_loop_p (loop, stmt))
3491     {
3492       outer_loop = loop;
3493       loop = loop->inner;
3494       nested_cycle = true;
3495     }
3496
3497   gcc_assert (ncopies >= 1);
3498
3499   /* FORNOW: SLP not supported.  */
3500   if (STMT_SLP_TYPE (stmt_info))
3501     return false;
3502
3503   /* 1. Is vectorizable reduction?  */
3504   /* Not supportable if the reduction variable is used in the loop.  */
3505   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3506     return false;
3507
3508   /* Reductions that are not used even in an enclosing outer-loop,
3509      are expected to be "live" (used out of the loop).  */
3510   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3511       && !STMT_VINFO_LIVE_P (stmt_info))
3512     return false;
3513
3514   /* Make sure it was already recognized as a reduction computation.  */
3515   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3516       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3517     return false;
3518
3519   /* 2. Has this been recognized as a reduction pattern?
3520
3521      Check if STMT represents a pattern that has been recognized
3522      in earlier analysis stages.  For stmts that represent a pattern,
3523      the STMT_VINFO_RELATED_STMT field records the last stmt in
3524      the original sequence that constitutes the pattern.  */
3525
3526   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3527   if (orig_stmt)
3528     {
3529       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3530       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3531       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3532       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3533     }
3534
3535   /* 3. Check the operands of the operation. The first operands are defined
3536         inside the loop body. The last operand is the reduction variable,
3537         which is defined by the loop-header-phi.  */
3538
3539   gcc_assert (is_gimple_assign (stmt));
3540
3541   /* Flatten RHS */
3542   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3543     {
3544     case GIMPLE_SINGLE_RHS:
3545       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3546       if (op_type == ternary_op)
3547         {
3548           tree rhs = gimple_assign_rhs1 (stmt);
3549           ops[0] = TREE_OPERAND (rhs, 0);
3550           ops[1] = TREE_OPERAND (rhs, 1);
3551           ops[2] = TREE_OPERAND (rhs, 2);
3552           code = TREE_CODE (rhs);
3553         }
3554       else
3555         return false;
3556       break;
3557
3558     case GIMPLE_BINARY_RHS:
3559       code = gimple_assign_rhs_code (stmt);
3560       op_type = TREE_CODE_LENGTH (code);
3561       gcc_assert (op_type == binary_op);
3562       ops[0] = gimple_assign_rhs1 (stmt);
3563       ops[1] = gimple_assign_rhs2 (stmt);
3564       break;
3565
3566     case GIMPLE_UNARY_RHS:
3567       return false;
3568
3569     default:
3570       gcc_unreachable ();
3571     }
3572
3573   scalar_dest = gimple_assign_lhs (stmt);
3574   scalar_type = TREE_TYPE (scalar_dest);
3575   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3576       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3577     return false;
3578
3579   /* All uses but the last are expected to be defined in the loop.
3580      The last use is the reduction variable. In case of nested cycle this
3581      assumption is not true: we use reduc_index to record the index of the
3582      reduction variable.  */
3583   for (i = 0; i < op_type-1; i++)
3584     {
3585       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3586       if (i == 0 && code == COND_EXPR)
3587         continue;
3588
3589       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3590                                           &def, &dt);
3591       gcc_assert (is_simple_use);
3592       if (dt != vect_internal_def
3593           && dt != vect_external_def
3594           && dt != vect_constant_def
3595           && dt != vect_induction_def
3596           && !(dt == vect_nested_cycle && nested_cycle))
3597         return false;
3598
3599       if (dt == vect_nested_cycle)
3600         {
3601           found_nested_cycle_def = true;
3602           reduc_def_stmt = def_stmt;
3603           reduc_index = i;
3604         }
3605     }
3606
3607   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3608                                       &def, &dt);
3609   gcc_assert (is_simple_use);
3610   gcc_assert (dt == vect_reduction_def
3611               || dt == vect_nested_cycle
3612               || ((dt == vect_internal_def || dt == vect_external_def
3613                    || dt == vect_constant_def || dt == vect_induction_def)
3614                    && nested_cycle && found_nested_cycle_def));
3615   if (!found_nested_cycle_def)
3616     reduc_def_stmt = def_stmt;
3617
3618   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3619   if (orig_stmt)
3620     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3621                                                        reduc_def_stmt,
3622                                                        !nested_cycle,
3623                                                        &dummy));
3624   else
3625     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3626                                                   !nested_cycle, &dummy));
3627
3628   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3629     return false;
3630
3631   vec_mode = TYPE_MODE (vectype);
3632
3633   if (code == COND_EXPR)
3634     {
3635       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3636         {
3637           if (vect_print_dump_info (REPORT_DETAILS))
3638             fprintf (vect_dump, "unsupported condition in reduction");
3639
3640             return false;
3641         }
3642     }
3643   else
3644     {
3645       /* 4. Supportable by target?  */
3646
3647       /* 4.1. check support for the operation in the loop  */
3648       optab = optab_for_tree_code (code, vectype, optab_default);
3649       if (!optab)
3650         {
3651           if (vect_print_dump_info (REPORT_DETAILS))
3652             fprintf (vect_dump, "no optab.");
3653
3654           return false;
3655         }
3656
3657       if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3658         {
3659           if (vect_print_dump_info (REPORT_DETAILS))
3660             fprintf (vect_dump, "op not supported by target.");
3661
3662           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3663               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3664                   < vect_min_worthwhile_factor (code))
3665             return false;
3666
3667           if (vect_print_dump_info (REPORT_DETAILS))
3668             fprintf (vect_dump, "proceeding using word mode.");
3669         }
3670
3671       /* Worthwhile without SIMD support?  */
3672       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3673           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3674              < vect_min_worthwhile_factor (code))
3675         {
3676           if (vect_print_dump_info (REPORT_DETAILS))
3677             fprintf (vect_dump, "not worthwhile without SIMD support.");
3678
3679           return false;
3680         }
3681     }
3682
3683   /* 4.2. Check support for the epilog operation.
3684
3685           If STMT represents a reduction pattern, then the type of the
3686           reduction variable may be different than the type of the rest
3687           of the arguments.  For example, consider the case of accumulation
3688           of shorts into an int accumulator; The original code:
3689                         S1: int_a = (int) short_a;
3690           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
3691
3692           was replaced with:
3693                         STMT: int_acc = widen_sum <short_a, int_acc>
3694
3695           This means that:
3696           1. The tree-code that is used to create the vector operation in the
3697              epilog code (that reduces the partial results) is not the
3698              tree-code of STMT, but is rather the tree-code of the original
3699              stmt from the pattern that STMT is replacing. I.e, in the example
3700              above we want to use 'widen_sum' in the loop, but 'plus' in the
3701              epilog.
3702           2. The type (mode) we use to check available target support
3703              for the vector operation to be created in the *epilog*, is
3704              determined by the type of the reduction variable (in the example
3705              above we'd check this: plus_optab[vect_int_mode]).
3706              However the type (mode) we use to check available target support
3707              for the vector operation to be created *inside the loop*, is
3708              determined by the type of the other arguments to STMT (in the
3709              example we'd check this: widen_sum_optab[vect_short_mode]).
3710
3711           This is contrary to "regular" reductions, in which the types of all
3712           the arguments are the same as the type of the reduction variable.
3713           For "regular" reductions we can therefore use the same vector type
3714           (and also the same tree-code) when generating the epilog code and
3715           when generating the code inside the loop.  */
3716
3717   if (orig_stmt)
3718     {
3719       /* This is a reduction pattern: get the vectype from the type of the
3720          reduction variable, and get the tree-code from orig_stmt.  */
3721       orig_code = gimple_assign_rhs_code (orig_stmt);
3722       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3723       if (!vectype)
3724         {
3725           if (vect_print_dump_info (REPORT_DETAILS))
3726             {
3727               fprintf (vect_dump, "unsupported data-type ");
3728               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3729             }
3730           return false;
3731         }
3732
3733       vec_mode = TYPE_MODE (vectype);
3734     }
3735   else
3736     {
3737       /* Regular reduction: use the same vectype and tree-code as used for
3738          the vector code inside the loop can be used for the epilog code. */
3739       orig_code = code;
3740     }
3741
3742   if (nested_cycle)
3743     {
3744       def_bb = gimple_bb (reduc_def_stmt);
3745       def_stmt_loop = def_bb->loop_father;
3746       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3747                                        loop_preheader_edge (def_stmt_loop));
3748       if (TREE_CODE (def_arg) == SSA_NAME
3749           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3750           && gimple_code (def_arg_stmt) == GIMPLE_PHI
3751           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3752           && vinfo_for_stmt (def_arg_stmt)
3753           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3754               == vect_double_reduction_def)
3755         double_reduc = true;
3756     }
3757
3758   epilog_reduc_code = ERROR_MARK;
3759   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3760     {
3761       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3762                                          optab_default);
3763       if (!reduc_optab)
3764         {
3765           if (vect_print_dump_info (REPORT_DETAILS))
3766             fprintf (vect_dump, "no optab for reduction.");
3767
3768           epilog_reduc_code = ERROR_MARK;
3769         }
3770
3771       if (reduc_optab
3772           && optab_handler (reduc_optab, vec_mode)->insn_code
3773               == CODE_FOR_nothing)
3774         {
3775           if (vect_print_dump_info (REPORT_DETAILS))
3776             fprintf (vect_dump, "reduc op not supported by target.");
3777
3778           epilog_reduc_code = ERROR_MARK;
3779         }
3780     }
3781   else
3782     {
3783       if (!nested_cycle || double_reduc)
3784         {
3785           if (vect_print_dump_info (REPORT_DETAILS))
3786             fprintf (vect_dump, "no reduc code for scalar code.");
3787
3788           return false;
3789         }
3790     }
3791
3792   if (double_reduc && ncopies > 1)
3793     {
3794       if (vect_print_dump_info (REPORT_DETAILS))
3795         fprintf (vect_dump, "multiple types in double reduction");
3796
3797       return false;
3798     }
3799
3800   if (!vec_stmt) /* transformation not required.  */
3801     {
3802       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3803       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3804         return false;
3805       return true;
3806     }
3807
3808   /** Transform.  **/
3809
3810   if (vect_print_dump_info (REPORT_DETAILS))
3811     fprintf (vect_dump, "transform reduction.");
3812
3813   /* FORNOW: Multiple types are not supported for condition.  */
3814   if (code == COND_EXPR)
3815     gcc_assert (ncopies == 1);
3816
3817   /* Create the destination vector  */
3818   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3819
3820   /* In case the vectorization factor (VF) is bigger than the number
3821      of elements that we can fit in a vectype (nunits), we have to generate
3822      more than one vector stmt - i.e - we need to "unroll" the
3823      vector stmt by a factor VF/nunits.  For more details see documentation
3824      in vectorizable_operation.  */
3825
3826   /* If the reduction is used in an outer loop we need to generate
3827      VF intermediate results, like so (e.g. for ncopies=2):
3828         r0 = phi (init, r0)
3829         r1 = phi (init, r1)
3830         r0 = x0 + r0;
3831         r1 = x1 + r1;
3832     (i.e. we generate VF results in 2 registers).
3833     In this case we have a separate def-use cycle for each copy, and therefore
3834     for each copy we get the vector def for the reduction variable from the
3835     respective phi node created for this copy.
3836
3837     Otherwise (the reduction is unused in the loop nest), we can combine
3838     together intermediate results, like so (e.g. for ncopies=2):
3839         r = phi (init, r)
3840         r = x0 + r;
3841         r = x1 + r;
3842    (i.e. we generate VF/2 results in a single register).
3843    In this case for each copy we get the vector def for the reduction variable
3844    from the vectorized reduction operation generated in the previous iteration.
3845   */
3846
3847   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3848     {
3849       single_defuse_cycle = true;
3850       epilog_copies = 1;
3851     }
3852   else
3853     epilog_copies = ncopies;
3854
3855   prev_stmt_info = NULL;
3856   prev_phi_info = NULL;
3857   for (j = 0; j < ncopies; j++)
3858     {
3859       if (j == 0 || !single_defuse_cycle)
3860         {
3861           /* Create the reduction-phi that defines the reduction-operand.  */
3862           new_phi = create_phi_node (vec_dest, loop->header);
3863           set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3864                                                           NULL));
3865           /* Get the vector def for the reduction variable from the phi
3866              node.  */
3867           reduc_def = PHI_RESULT (new_phi);
3868         }
3869
3870       if (code == COND_EXPR)
3871         {
3872           first_phi = new_phi;
3873           vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
3874           /* Multiple types are not supported for condition.  */
3875           break;
3876         }
3877
3878       /* Handle uses.  */
3879       if (j == 0)
3880         {
3881           loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3882                                                         stmt, NULL);
3883           if (op_type == ternary_op)
3884             {
3885               if (reduc_index == 0)
3886                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3887                                                               NULL);
3888               else
3889                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3890                                                               NULL);
3891             }
3892
3893           /* Get the vector def for the reduction variable from the phi
3894              node.  */
3895           first_phi = new_phi;
3896         }
3897       else
3898         {
3899           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3900           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3901           if (op_type == ternary_op)
3902             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3903
3904           if (single_defuse_cycle)
3905             reduc_def = gimple_assign_lhs (new_stmt);
3906           else
3907             reduc_def = PHI_RESULT (new_phi);
3908
3909           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3910         }
3911
3912       /* Arguments are ready. Create the new vector stmt.  */
3913       if (op_type == binary_op)
3914         {
3915           if (reduc_index == 0)
3916             expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3917           else
3918             expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3919         }
3920       else
3921         {
3922           if (reduc_index == 0)
3923             expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3924                            loop_vec_def1);
3925           else
3926             {
3927               if (reduc_index == 1)
3928                 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3929                                loop_vec_def1);
3930               else
3931                 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3932                                reduc_def);
3933             }
3934         }
3935
3936       new_stmt = gimple_build_assign (vec_dest, expr);
3937       new_temp = make_ssa_name (vec_dest, new_stmt);
3938       gimple_assign_set_lhs (new_stmt, new_temp);
3939       vect_finish_stmt_generation (stmt, new_stmt, gsi);
3940
3941       if (j == 0)
3942         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3943       else
3944         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3945
3946       prev_stmt_info = vinfo_for_stmt (new_stmt);
3947       prev_phi_info = vinfo_for_stmt (new_phi);
3948     }
3949
3950   /* Finalize the reduction-phi (set its arguments) and create the
3951      epilog reduction code.  */
3952   if (!single_defuse_cycle || code == COND_EXPR)
3953     new_temp = gimple_assign_lhs (*vec_stmt);
3954
3955   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3956                                     epilog_reduc_code, first_phi, reduc_index,
3957                                     double_reduc);
3958   return true;
3959 }
3960
3961 /* Function vect_min_worthwhile_factor.
3962
3963    For a loop where we could vectorize the operation indicated by CODE,
3964    return the minimum vectorization factor that makes it worthwhile
3965    to use generic vectors.  */
3966 int
3967 vect_min_worthwhile_factor (enum tree_code code)
3968 {
3969   switch (code)
3970     {
3971     case PLUS_EXPR:
3972     case MINUS_EXPR:
3973     case NEGATE_EXPR:
3974       return 4;
3975
3976     case BIT_AND_EXPR:
3977     case BIT_IOR_EXPR:
3978     case BIT_XOR_EXPR:
3979     case BIT_NOT_EXPR:
3980       return 2;
3981
3982     default:
3983       return INT_MAX;
3984     }
3985 }
3986
3987
3988 /* Function vectorizable_induction
3989
3990    Check if PHI performs an induction computation that can be vectorized.
3991    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3992    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3993    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3994
3995 bool
3996 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3997                         gimple *vec_stmt)
3998 {
3999   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4000   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4001   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4002   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4003   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4004   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4005   tree vec_def;
4006
4007   gcc_assert (ncopies >= 1);
4008   /* FORNOW. This restriction should be relaxed.  */
4009   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4010     {
4011       if (vect_print_dump_info (REPORT_DETAILS))
4012         fprintf (vect_dump, "multiple types in nested loop.");
4013       return false;
4014     }
4015
4016   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4017     return false;
4018
4019   /* FORNOW: SLP not supported.  */
4020   if (STMT_SLP_TYPE (stmt_info))
4021     return false;
4022
4023   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4024
4025   if (gimple_code (phi) != GIMPLE_PHI)
4026     return false;
4027
4028   if (!vec_stmt) /* transformation not required.  */
4029     {
4030       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4031       if (vect_print_dump_info (REPORT_DETAILS))
4032         fprintf (vect_dump, "=== vectorizable_induction ===");
4033       vect_model_induction_cost (stmt_info, ncopies);
4034       return true;
4035     }
4036
4037   /** Transform.  **/
4038
4039   if (vect_print_dump_info (REPORT_DETAILS))
4040     fprintf (vect_dump, "transform induction phi.");
4041
4042   vec_def = get_initial_def_for_induction (phi);
4043   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4044   return true;
4045 }
4046
4047 /* Function vectorizable_live_operation.
4048
4049    STMT computes a value that is used outside the loop. Check if
4050    it can be supported.  */
4051
4052 bool
4053 vectorizable_live_operation (gimple stmt,
4054                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4055                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4056 {
4057   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4058   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4059   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4060   int i;
4061   int op_type;
4062   tree op;
4063   tree def;
4064   gimple def_stmt;
4065   enum vect_def_type dt;
4066   enum tree_code code;
4067   enum gimple_rhs_class rhs_class;
4068
4069   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4070
4071   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4072     return false;
4073
4074   if (!is_gimple_assign (stmt))
4075     return false;
4076
4077   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4078     return false;
4079
4080   /* FORNOW. CHECKME. */
4081   if (nested_in_vect_loop_p (loop, stmt))
4082     return false;
4083
4084   code = gimple_assign_rhs_code (stmt);
4085   op_type = TREE_CODE_LENGTH (code);
4086   rhs_class = get_gimple_rhs_class (code);
4087   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4088   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4089
4090   /* FORNOW: support only if all uses are invariant. This means
4091      that the scalar operations can remain in place, unvectorized.
4092      The original last scalar value that they compute will be used.  */
4093
4094   for (i = 0; i < op_type; i++)
4095     {
4096       if (rhs_class == GIMPLE_SINGLE_RHS)
4097         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4098       else
4099         op = gimple_op (stmt, i + 1);
4100       if (op
4101           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4102         {
4103           if (vect_print_dump_info (REPORT_DETAILS))
4104             fprintf (vect_dump, "use not simple.");
4105           return false;
4106         }
4107
4108       if (dt != vect_external_def && dt != vect_constant_def)
4109         return false;
4110     }
4111
4112   /* No transformation is required for the cases we currently support.  */
4113   return true;
4114 }
4115
4116 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
4117
4118 static void
4119 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4120 {
4121   ssa_op_iter op_iter;
4122   imm_use_iterator imm_iter;
4123   def_operand_p def_p;
4124   gimple ustmt;
4125
4126   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4127     {
4128       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4129         {
4130           basic_block bb;
4131
4132           if (!is_gimple_debug (ustmt))
4133             continue;
4134
4135           bb = gimple_bb (ustmt);
4136
4137           if (!flow_bb_inside_loop_p (loop, bb))
4138             {
4139               if (gimple_debug_bind_p (ustmt))
4140                 {
4141                   if (vect_print_dump_info (REPORT_DETAILS))
4142                     fprintf (vect_dump, "killing debug use");
4143
4144                   gimple_debug_bind_reset_value (ustmt);
4145                   update_stmt (ustmt);
4146                 }
4147               else
4148                 gcc_unreachable ();
4149             }
4150         }
4151     }
4152 }
4153
4154 /* Function vect_transform_loop.
4155
4156    The analysis phase has determined that the loop is vectorizable.
4157    Vectorize the loop - created vectorized stmts to replace the scalar
4158    stmts in the loop, and update the loop exit condition.  */
4159
4160 void
4161 vect_transform_loop (loop_vec_info loop_vinfo)
4162 {
4163   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4164   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4165   int nbbs = loop->num_nodes;
4166   gimple_stmt_iterator si;
4167   int i;
4168   tree ratio = NULL;
4169   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4170   bool strided_store;
4171   bool slp_scheduled = false;
4172   unsigned int nunits;
4173   tree cond_expr = NULL_TREE;
4174   gimple_seq cond_expr_stmt_list = NULL;
4175   bool do_peeling_for_loop_bound;
4176
4177   if (vect_print_dump_info (REPORT_DETAILS))
4178     fprintf (vect_dump, "=== vec_transform_loop ===");
4179
4180   /* Peel the loop if there are data refs with unknown alignment.
4181      Only one data ref with unknown store is allowed.  */
4182
4183   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4184     vect_do_peeling_for_alignment (loop_vinfo);
4185
4186   do_peeling_for_loop_bound
4187     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4188        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4189            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4190
4191   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4192       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4193     vect_loop_versioning (loop_vinfo,
4194                           !do_peeling_for_loop_bound,
4195                           &cond_expr, &cond_expr_stmt_list);
4196
4197   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4198      compile time constant), or it is a constant that doesn't divide by the
4199      vectorization factor, then an epilog loop needs to be created.
4200      We therefore duplicate the loop: the original loop will be vectorized,
4201      and will compute the first (n/VF) iterations. The second copy of the loop
4202      will remain scalar and will compute the remaining (n%VF) iterations.
4203      (VF is the vectorization factor).  */
4204
4205   if (do_peeling_for_loop_bound)
4206     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4207                                     cond_expr, cond_expr_stmt_list);
4208   else
4209     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4210                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4211
4212   /* 1) Make sure the loop header has exactly two entries
4213      2) Make sure we have a preheader basic block.  */
4214
4215   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4216
4217   split_edge (loop_preheader_edge (loop));
4218
4219   /* FORNOW: the vectorizer supports only loops which body consist
4220      of one basic block (header + empty latch). When the vectorizer will
4221      support more involved loop forms, the order by which the BBs are
4222      traversed need to be reconsidered.  */
4223
4224   for (i = 0; i < nbbs; i++)
4225     {
4226       basic_block bb = bbs[i];
4227       stmt_vec_info stmt_info;
4228       gimple phi;
4229
4230       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4231         {
4232           phi = gsi_stmt (si);
4233           if (vect_print_dump_info (REPORT_DETAILS))
4234             {
4235               fprintf (vect_dump, "------>vectorizing phi: ");
4236               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4237             }
4238           stmt_info = vinfo_for_stmt (phi);
4239           if (!stmt_info)
4240             continue;
4241
4242           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4243             vect_loop_kill_debug_uses (loop, phi);
4244
4245           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4246               && !STMT_VINFO_LIVE_P (stmt_info))
4247             continue;
4248
4249           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4250                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4251               && vect_print_dump_info (REPORT_DETAILS))
4252             fprintf (vect_dump, "multiple-types.");
4253
4254           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4255             {
4256               if (vect_print_dump_info (REPORT_DETAILS))
4257                 fprintf (vect_dump, "transform phi.");
4258               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4259             }
4260         }
4261
4262       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4263         {
4264           gimple stmt = gsi_stmt (si);
4265           bool is_store;
4266
4267           if (vect_print_dump_info (REPORT_DETAILS))
4268             {
4269               fprintf (vect_dump, "------>vectorizing statement: ");
4270               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4271             }
4272
4273           stmt_info = vinfo_for_stmt (stmt);
4274
4275           /* vector stmts created in the outer-loop during vectorization of
4276              stmts in an inner-loop may not have a stmt_info, and do not
4277              need to be vectorized.  */
4278           if (!stmt_info)
4279             {
4280               gsi_next (&si);
4281               continue;
4282             }
4283
4284           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4285             vect_loop_kill_debug_uses (loop, stmt);
4286
4287           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4288               && !STMT_VINFO_LIVE_P (stmt_info))
4289             {
4290               gsi_next (&si);
4291               continue;
4292             }
4293
4294           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4295           nunits =
4296             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4297           if (!STMT_SLP_TYPE (stmt_info)
4298               && nunits != (unsigned int) vectorization_factor
4299               && vect_print_dump_info (REPORT_DETAILS))
4300             /* For SLP VF is set according to unrolling factor, and not to
4301                vector size, hence for SLP this print is not valid.  */
4302             fprintf (vect_dump, "multiple-types.");
4303
4304           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4305              reached.  */
4306           if (STMT_SLP_TYPE (stmt_info))
4307             {
4308               if (!slp_scheduled)
4309                 {
4310                   slp_scheduled = true;
4311
4312                   if (vect_print_dump_info (REPORT_DETAILS))
4313                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4314
4315                   vect_schedule_slp (loop_vinfo, NULL);
4316                 }
4317
4318               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4319               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4320                 {
4321                   gsi_next (&si);
4322                   continue;
4323                 }
4324             }
4325
4326           /* -------- vectorize statement ------------ */
4327           if (vect_print_dump_info (REPORT_DETAILS))
4328             fprintf (vect_dump, "transform statement.");
4329
4330           strided_store = false;
4331           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4332           if (is_store)
4333             {
4334               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4335                 {
4336                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4337                      interleaving chain was completed - free all the stores in
4338                      the chain.  */
4339                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4340                   gsi_remove (&si, true);
4341                   continue;
4342                 }
4343               else
4344                 {
4345                   /* Free the attached stmt_vec_info and remove the stmt.  */
4346                   free_stmt_vec_info (stmt);
4347                   gsi_remove (&si, true);
4348                   continue;
4349                 }
4350             }
4351           gsi_next (&si);
4352         }                       /* stmts in BB */
4353     }                           /* BBs in loop */
4354
4355   slpeel_make_loop_iterate_ntimes (loop, ratio);
4356
4357   /* The memory tags and pointers in vectorized statements need to
4358      have their SSA forms updated.  FIXME, why can't this be delayed
4359      until all the loops have been transformed?  */
4360   update_ssa (TODO_update_ssa);
4361
4362   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4363     fprintf (vect_dump, "LOOP VECTORIZED.");
4364   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4365     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4366 }