gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
   3    Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "diagnostic.h"
  31 #include "tree-flow.h"
  32 #include "tree-dump.h"
  33 #include "cfgloop.h"
  34 #include "cfglayout.h"
  35 #include "expr.h"
  36 #include "recog.h"
  37 #include "optabs.h"
  38 #include "params.h"
  39 #include "toplev.h"
  40 #include "tree-chrec.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs"). These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized. It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S). This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 127    support different sizes of vectors, for now will need to specify one value
 128    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 129
 130         Since we only vectorize operations which vector form can be
 131    expressed using existing tree codes, to verify that an operation is
 132    supported, the vectorizer checks the relevant optab at the relevant
 133    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 134    the value found is CODE_FOR_nothing, then there's no target support, and
 135    we can't vectorize the stmt.
 136
 137    For additional information on this project see:
 138    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 139 */
 140
 141 /* Function vect_determine_vectorization_factor
 142
 143    Determine the vectorization factor (VF). VF is the number of data elements
 144    that are operated upon in parallel in a single iteration of the vectorized
 145    loop. For example, when vectorizing a loop that operates on 4byte elements,
 146    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 147    elements can fit in a single vector register.
 148
 149    We currently support vectorization of loops in which all types operated upon
 150    are of the same size. Therefore this function currently sets VF according to
 151    the size of the types operated upon, and fails if there are multiple sizes
 152    in the loop.
 153
 154    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 155    original loop:
 156         for (i=0; i<N; i++){
 157           a[i] = b[i] + c[i];
 158         }
 159
 160    vectorized loop:
 161         for (i=0; i<N; i+=VF){
 162           a[i:VF] = b[i:VF] + c[i:VF];
 163         }
 164 */
 165
 166 static bool
 167 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 168 {
 169   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 170   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 171   int nbbs = loop->num_nodes;
 172   gimple_stmt_iterator si;
 173   unsigned int vectorization_factor = 0;
 174   tree scalar_type;
 175   gimple phi;
 176   tree vectype;
 177   unsigned int nunits;
 178   stmt_vec_info stmt_info;
 179   int i;
 180   HOST_WIDE_INT dummy;
 181
 182   if (vect_print_dump_info (REPORT_DETAILS))
 183     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 184
 185   for (i = 0; i < nbbs; i++)
 186     {
 187       basic_block bb = bbs[i];
 188
 189       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 190         {
 191           phi = gsi_stmt (si);
 192           stmt_info = vinfo_for_stmt (phi);
 193           if (vect_print_dump_info (REPORT_DETAILS))
 194             {
 195               fprintf (vect_dump, "==> examining phi: ");
 196               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 197             }
 198
 199           gcc_assert (stmt_info);
 200
 201           if (STMT_VINFO_RELEVANT_P (stmt_info))
 202             {
 203               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 204               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 205
 206               if (vect_print_dump_info (REPORT_DETAILS))
 207                 {
 208                   fprintf (vect_dump, "get vectype for scalar type:  ");
 209                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 210                 }
 211
 212               vectype = get_vectype_for_scalar_type (scalar_type);
 213               if (!vectype)
 214                 {
 215                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 216                     {
 217                       fprintf (vect_dump,
 218                                "not vectorized: unsupported data-type ");
 219                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 220                     }
 221                   return false;
 222                 }
 223               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 224
 225               if (vect_print_dump_info (REPORT_DETAILS))
 226                 {
 227                   fprintf (vect_dump, "vectype: ");
 228                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 229                 }
 230
 231               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 232               if (vect_print_dump_info (REPORT_DETAILS))
 233                 fprintf (vect_dump, "nunits = %d", nunits);
 234
 235               if (!vectorization_factor
 236                   || (nunits > vectorization_factor))
 237                 vectorization_factor = nunits;
 238             }
 239         }
 240
 241       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 242         {
 243           gimple stmt = gsi_stmt (si);
 244           stmt_info = vinfo_for_stmt (stmt);
 245
 246           if (vect_print_dump_info (REPORT_DETAILS))
 247             {
 248               fprintf (vect_dump, "==> examining statement: ");
 249               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 250             }
 251
 252           gcc_assert (stmt_info);
 253
 254           /* skip stmts which do not need to be vectorized.  */
 255           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 256               && !STMT_VINFO_LIVE_P (stmt_info))
 257             {
 258               if (vect_print_dump_info (REPORT_DETAILS))
 259                 fprintf (vect_dump, "skip.");
 260               continue;
 261             }
 262
 263           if (gimple_get_lhs (stmt) == NULL_TREE)
 264             {
 265               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 266                 {
 267                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 268                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 269                 }
 270               return false;
 271             }
 272
 273           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 274             {
 275               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 276                 {
 277                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 278                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 279                 }
 280               return false;
 281             }
 282
 283           if (STMT_VINFO_VECTYPE (stmt_info))
 284             {
 285               /* The only case when a vectype had been already set is for stmts
 286                  that contain a dataref, or for "pattern-stmts" (stmts generated
 287                  by the vectorizer to represent/replace a certain idiom).  */
 288               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 289                           || is_pattern_stmt_p (stmt_info));
 290               vectype = STMT_VINFO_VECTYPE (stmt_info);
 291             }
 292           else
 293             {
 294               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 295                           && !is_pattern_stmt_p (stmt_info));
 296
 297               scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 298                                                            &dummy);
 299               if (vect_print_dump_info (REPORT_DETAILS))
 300                 {
 301                   fprintf (vect_dump, "get vectype for scalar type:  ");
 302                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 303                 }
 304
 305               vectype = get_vectype_for_scalar_type (scalar_type);
 306               if (!vectype)
 307                 {
 308                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 309                     {
 310                       fprintf (vect_dump,
 311                                "not vectorized: unsupported data-type ");
 312                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 313                     }
 314                   return false;
 315                 }
 316               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 317             }
 318
 319           if (vect_print_dump_info (REPORT_DETAILS))
 320             {
 321               fprintf (vect_dump, "vectype: ");
 322               print_generic_expr (vect_dump, vectype, TDF_SLIM);
 323             }
 324
 325           nunits = TYPE_VECTOR_SUBPARTS (vectype);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             fprintf (vect_dump, "nunits = %d", nunits);
 328
 329           if (!vectorization_factor
 330               || (nunits > vectorization_factor))
 331             vectorization_factor = nunits;
 332
 333         }
 334     }
 335
 336   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 337   if (vect_print_dump_info (REPORT_DETAILS))
 338     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 339   if (vectorization_factor <= 1)
 340     {
 341       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 342         fprintf (vect_dump, "not vectorized: unsupported data-type");
 343       return false;
 344     }
 345   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 346
 347   return true;
 348 }
 349
 350
 351 /* Function vect_is_simple_iv_evolution.
 352
 353    FORNOW: A simple evolution of an induction variables in the loop is
 354    considered a polynomial evolution with constant step.  */
 355
 356 static bool
 357 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 358                              tree * step)
 359 {
 360   tree init_expr;
 361   tree step_expr;
 362   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 363
 364   /* When there is no evolution in this loop, the evolution function
 365      is not "simple".  */
 366   if (evolution_part == NULL_TREE)
 367     return false;
 368
 369   /* When the evolution is a polynomial of degree >= 2
 370      the evolution function is not "simple".  */
 371   if (tree_is_chrec (evolution_part))
 372     return false;
 373
 374   step_expr = evolution_part;
 375   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 376
 377   if (vect_print_dump_info (REPORT_DETAILS))
 378     {
 379       fprintf (vect_dump, "step: ");
 380       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 381       fprintf (vect_dump, ",  init: ");
 382       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 383     }
 384
 385   *init = init_expr;
 386   *step = step_expr;
 387
 388   if (TREE_CODE (step_expr) != INTEGER_CST)
 389     {
 390       if (vect_print_dump_info (REPORT_DETAILS))
 391         fprintf (vect_dump, "step unknown.");
 392       return false;
 393     }
 394
 395   return true;
 396 }
 397
 398 /* Function vect_analyze_scalar_cycles_1.
 399
 400    Examine the cross iteration def-use cycles of scalar variables
 401    in LOOP. LOOP_VINFO represents the loop that is now being
 402    considered for vectorization (can be LOOP, or an outer-loop
 403    enclosing LOOP).  */
 404
 405 static void
 406 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 407 {
 408   basic_block bb = loop->header;
 409   tree dumy;
 410   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 411   gimple_stmt_iterator gsi;
 412   bool double_reduc;
 413
 414   if (vect_print_dump_info (REPORT_DETAILS))
 415     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 416
 417   /* First - identify all inductions. Reduction detection assumes that all the
 418      inductions have been identified, therefore, this order must not be
 419      changed.  */
 420   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 421     {
 422       gimple phi = gsi_stmt (gsi);
 423       tree access_fn = NULL;
 424       tree def = PHI_RESULT (phi);
 425       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 426
 427       if (vect_print_dump_info (REPORT_DETAILS))
 428         {
 429           fprintf (vect_dump, "Analyze phi: ");
 430           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 431         }
 432
 433       /* Skip virtual phi's. The data dependences that are associated with
 434          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 435       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 436         continue;
 437
 438       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 439
 440       /* Analyze the evolution function.  */
 441       access_fn = analyze_scalar_evolution (loop, def);
 442       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 443         {
 444           fprintf (vect_dump, "Access function of PHI: ");
 445           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 446         }
 447
 448       if (!access_fn
 449           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 450         {
 451           VEC_safe_push (gimple, heap, worklist, phi);
 452           continue;
 453         }
 454
 455       if (vect_print_dump_info (REPORT_DETAILS))
 456         fprintf (vect_dump, "Detected induction.");
 457       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 458     }
 459
 460
 461   /* Second - identify all reductions and nested cycles.  */
 462   while (VEC_length (gimple, worklist) > 0)
 463     {
 464       gimple phi = VEC_pop (gimple, worklist);
 465       tree def = PHI_RESULT (phi);
 466       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 467       gimple reduc_stmt;
 468       bool nested_cycle;
 469
 470       if (vect_print_dump_info (REPORT_DETAILS))
 471         {
 472           fprintf (vect_dump, "Analyze phi: ");
 473           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 474         }
 475
 476       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 477       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 478
 479       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 480       reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
 481                                              &double_reduc);
 482       if (reduc_stmt)
 483         {
 484           if (double_reduc)
 485             {
 486               if (vect_print_dump_info (REPORT_DETAILS))
 487                 fprintf (vect_dump, "Detected double reduction.");
 488
 489               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 490               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 491                                                     vect_double_reduction_def;
 492             }
 493           else
 494             {
 495               if (nested_cycle)
 496                 {
 497                   if (vect_print_dump_info (REPORT_DETAILS))
 498                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 499
 500                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 501                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 502                                                              vect_nested_cycle;
 503                 }
 504               else
 505                 {
 506                   if (vect_print_dump_info (REPORT_DETAILS))
 507                     fprintf (vect_dump, "Detected reduction.");
 508
 509                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 510                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 511                                                            vect_reduction_def;
 512                 }
 513             }
 514         }
 515       else
 516         if (vect_print_dump_info (REPORT_DETAILS))
 517           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 518     }
 519
 520   VEC_free (gimple, heap, worklist);
 521 }
 522
 523
 524 /* Function vect_analyze_scalar_cycles.
 525
 526    Examine the cross iteration def-use cycles of scalar variables, by
 527    analyzing the loop-header PHIs of scalar variables; Classify each
 528    cycle as one of the following: invariant, induction, reduction, unknown.
 529    We do that for the loop represented by LOOP_VINFO, and also to its
 530    inner-loop, if exists.
 531    Examples for scalar cycles:
 532
 533    Example1: reduction:
 534
 535               loop1:
 536               for (i=0; i<N; i++)
 537                  sum += a[i];
 538
 539    Example2: induction:
 540
 541               loop2:
 542               for (i=0; i<N; i++)
 543                  a[i] = i;  */
 544
 545 static void
 546 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 547 {
 548   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 549
 550   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 551
 552   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 553      Reductions in such inner-loop therefore have different properties than
 554      the reductions in the nest that gets vectorized:
 555      1. When vectorized, they are executed in the same order as in the original
 556         scalar loop, so we can't change the order of computation when
 557         vectorizing them.
 558      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 559         current checks are too strict.  */
 560
 561   if (loop->inner)
 562     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 563 }
 564
 565 /* Function vect_get_loop_niters.
 566
 567    Determine how many iterations the loop is executed.
 568    If an expression that represents the number of iterations
 569    can be constructed, place it in NUMBER_OF_ITERATIONS.
 570    Return the loop exit condition.  */
 571
 572 static gimple
 573 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 574 {
 575   tree niters;
 576
 577   if (vect_print_dump_info (REPORT_DETAILS))
 578     fprintf (vect_dump, "=== get_loop_niters ===");
 579
 580   niters = number_of_exit_cond_executions (loop);
 581
 582   if (niters != NULL_TREE
 583       && niters != chrec_dont_know)
 584     {
 585       *number_of_iterations = niters;
 586
 587       if (vect_print_dump_info (REPORT_DETAILS))
 588         {
 589           fprintf (vect_dump, "==> get_loop_niters:" );
 590           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 591         }
 592     }
 593
 594   return get_loop_exit_condition (loop);
 595 }
 596
 597
 598 /* Function bb_in_loop_p
 599
 600    Used as predicate for dfs order traversal of the loop bbs.  */
 601
 602 static bool
 603 bb_in_loop_p (const_basic_block bb, const void *data)
 604 {
 605   const struct loop *const loop = (const struct loop *)data;
 606   if (flow_bb_inside_loop_p (loop, bb))
 607     return true;
 608   return false;
 609 }
 610
 611
 612 /* Function new_loop_vec_info.
 613
 614    Create and initialize a new loop_vec_info struct for LOOP, as well as
 615    stmt_vec_info structs for all the stmts in LOOP.  */
 616
 617 static loop_vec_info
 618 new_loop_vec_info (struct loop *loop)
 619 {
 620   loop_vec_info res;
 621   basic_block *bbs;
 622   gimple_stmt_iterator si;
 623   unsigned int i, nbbs;
 624
 625   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 626   LOOP_VINFO_LOOP (res) = loop;
 627
 628   bbs = get_loop_body (loop);
 629
 630   /* Create/Update stmt_info for all stmts in the loop.  */
 631   for (i = 0; i < loop->num_nodes; i++)
 632     {
 633       basic_block bb = bbs[i];
 634
 635       /* BBs in a nested inner-loop will have been already processed (because
 636          we will have called vect_analyze_loop_form for any nested inner-loop).
 637          Therefore, for stmts in an inner-loop we just want to update the
 638          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 639          loop_info of the outer-loop we are currently considering to vectorize
 640          (instead of the loop_info of the inner-loop).
 641          For stmts in other BBs we need to create a stmt_info from scratch.  */
 642       if (bb->loop_father != loop)
 643         {
 644           /* Inner-loop bb.  */
 645           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 646           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 647             {
 648               gimple phi = gsi_stmt (si);
 649               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 650               loop_vec_info inner_loop_vinfo =
 651                 STMT_VINFO_LOOP_VINFO (stmt_info);
 652               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 653               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 654             }
 655           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 656            {
 657               gimple stmt = gsi_stmt (si);
 658               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 659               loop_vec_info inner_loop_vinfo =
 660                  STMT_VINFO_LOOP_VINFO (stmt_info);
 661               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 662               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 663            }
 664         }
 665       else
 666         {
 667           /* bb in current nest.  */
 668           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 669             {
 670               gimple phi = gsi_stmt (si);
 671               gimple_set_uid (phi, 0);
 672               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 673             }
 674
 675           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 676             {
 677               gimple stmt = gsi_stmt (si);
 678               gimple_set_uid (stmt, 0);
 679               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 680             }
 681         }
 682     }
 683
 684   /* CHECKME: We want to visit all BBs before their successors (except for
 685      latch blocks, for which this assertion wouldn't hold).  In the simple
 686      case of the loop forms we allow, a dfs order of the BBs would the same
 687      as reversed postorder traversal, so we are safe.  */
 688
 689    free (bbs);
 690    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 691    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 692                               bbs, loop->num_nodes, loop);
 693    gcc_assert (nbbs == loop->num_nodes);
 694
 695   LOOP_VINFO_BBS (res) = bbs;
 696   LOOP_VINFO_NITERS (res) = NULL;
 697   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 698   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 699   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 700   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 701   LOOP_VINFO_VECT_FACTOR (res) = 0;
 702   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 703   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 704   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 705   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 706     VEC_alloc (gimple, heap,
 707                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 708   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 709     VEC_alloc (ddr_p, heap,
 710                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 711   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 712   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 713   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 714
 715   return res;
 716 }
 717
 718
 719 /* Function destroy_loop_vec_info.
 720
 721    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 722    stmts in the loop.  */
 723
 724 void
 725 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 726 {
 727   struct loop *loop;
 728   basic_block *bbs;
 729   int nbbs;
 730   gimple_stmt_iterator si;
 731   int j;
 732   VEC (slp_instance, heap) *slp_instances;
 733   slp_instance instance;
 734
 735   if (!loop_vinfo)
 736     return;
 737
 738   loop = LOOP_VINFO_LOOP (loop_vinfo);
 739
 740   bbs = LOOP_VINFO_BBS (loop_vinfo);
 741   nbbs = loop->num_nodes;
 742
 743   if (!clean_stmts)
 744     {
 745       free (LOOP_VINFO_BBS (loop_vinfo));
 746       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 747       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 748       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 749
 750       free (loop_vinfo);
 751       loop->aux = NULL;
 752       return;
 753     }
 754
 755   for (j = 0; j < nbbs; j++)
 756     {
 757       basic_block bb = bbs[j];
 758       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 759         free_stmt_vec_info (gsi_stmt (si));
 760
 761       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 762         {
 763           gimple stmt = gsi_stmt (si);
 764           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 765
 766           if (stmt_info)
 767             {
 768               /* Check if this is a "pattern stmt" (introduced by the
 769                  vectorizer during the pattern recognition pass).  */
 770               bool remove_stmt_p = false;
 771               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 772               if (orig_stmt)
 773                 {
 774                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 775                   if (orig_stmt_info
 776                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 777                     remove_stmt_p = true;
 778                 }
 779
 780               /* Free stmt_vec_info.  */
 781               free_stmt_vec_info (stmt);
 782
 783               /* Remove dead "pattern stmts".  */
 784               if (remove_stmt_p)
 785                 gsi_remove (&si, true);
 786             }
 787           gsi_next (&si);
 788         }
 789     }
 790
 791   free (LOOP_VINFO_BBS (loop_vinfo));
 792   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 793   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 794   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 795   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 796   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 797   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 798     vect_free_slp_instance (instance);
 799
 800   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 801   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 802
 803   free (loop_vinfo);
 804   loop->aux = NULL;
 805 }
 806
 807
 808 /* Function vect_analyze_loop_1.
 809
 810    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 811    for it. The different analyses will record information in the
 812    loop_vec_info struct.  This is a subset of the analyses applied in
 813    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 814    that is now considered for (outer-loop) vectorization.  */
 815
 816 static loop_vec_info
 817 vect_analyze_loop_1 (struct loop *loop)
 818 {
 819   loop_vec_info loop_vinfo;
 820
 821   if (vect_print_dump_info (REPORT_DETAILS))
 822     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 823
 824   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 825
 826   loop_vinfo = vect_analyze_loop_form (loop);
 827   if (!loop_vinfo)
 828     {
 829       if (vect_print_dump_info (REPORT_DETAILS))
 830         fprintf (vect_dump, "bad inner-loop form.");
 831       return NULL;
 832     }
 833
 834   return loop_vinfo;
 835 }
 836
 837
 838 /* Function vect_analyze_loop_form.
 839
 840    Verify that certain CFG restrictions hold, including:
 841    - the loop has a pre-header
 842    - the loop has a single entry and exit
 843    - the loop exit condition is simple enough, and the number of iterations
 844      can be analyzed (a countable loop).  */
 845
 846 loop_vec_info
 847 vect_analyze_loop_form (struct loop *loop)
 848 {
 849   loop_vec_info loop_vinfo;
 850   gimple loop_cond;
 851   tree number_of_iterations = NULL;
 852   loop_vec_info inner_loop_vinfo = NULL;
 853
 854   if (vect_print_dump_info (REPORT_DETAILS))
 855     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 856
 857   /* Different restrictions apply when we are considering an inner-most loop,
 858      vs. an outer (nested) loop.
 859      (FORNOW. May want to relax some of these restrictions in the future).  */
 860
 861   if (!loop->inner)
 862     {
 863       /* Inner-most loop.  We currently require that the number of BBs is
 864          exactly 2 (the header and latch).  Vectorizable inner-most loops
 865          look like this:
 866
 867                         (pre-header)
 868                            |
 869                           header <--------+
 870                            | |            |
 871                            | +--> latch --+
 872                            |
 873                         (exit-bb)  */
 874
 875       if (loop->num_nodes != 2)
 876         {
 877           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 878             fprintf (vect_dump, "not vectorized: control flow in loop.");
 879           return NULL;
 880         }
 881
 882       if (empty_block_p (loop->header))
 883     {
 884           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 885             fprintf (vect_dump, "not vectorized: empty loop.");
 886       return NULL;
 887     }
 888     }
 889   else
 890     {
 891       struct loop *innerloop = loop->inner;
 892       edge backedge, entryedge;
 893
 894       /* Nested loop. We currently require that the loop is doubly-nested,
 895          contains a single inner loop, and the number of BBs is exactly 5.
 896          Vectorizable outer-loops look like this:
 897
 898                         (pre-header)
 899                            |
 900                           header <---+
 901                            |         |
 902                           inner-loop |
 903                            |         |
 904                           tail ------+
 905                            |
 906                         (exit-bb)
 907
 908          The inner-loop has the properties expected of inner-most loops
 909          as described above.  */
 910
 911       if ((loop->inner)->inner || (loop->inner)->next)
 912         {
 913           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 914             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 915           return NULL;
 916         }
 917
 918       /* Analyze the inner-loop.  */
 919       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 920       if (!inner_loop_vinfo)
 921         {
 922           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 923             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 924           return NULL;
 925         }
 926
 927       if (!expr_invariant_in_loop_p (loop,
 928                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 929         {
 930           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 931             fprintf (vect_dump,
 932                      "not vectorized: inner-loop count not invariant.");
 933           destroy_loop_vec_info (inner_loop_vinfo, true);
 934           return NULL;
 935         }
 936
 937       if (loop->num_nodes != 5)
 938         {
 939           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 940             fprintf (vect_dump, "not vectorized: control flow in loop.");
 941           destroy_loop_vec_info (inner_loop_vinfo, true);
 942           return NULL;
 943         }
 944
 945       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 946       backedge = EDGE_PRED (innerloop->header, 1);
 947       entryedge = EDGE_PRED (innerloop->header, 0);
 948       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 949         {
 950           backedge = EDGE_PRED (innerloop->header, 0);
 951           entryedge = EDGE_PRED (innerloop->header, 1);
 952         }
 953
 954       if (entryedge->src != loop->header
 955           || !single_exit (innerloop)
 956           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
 957         {
 958           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 959             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
 960           destroy_loop_vec_info (inner_loop_vinfo, true);
 961           return NULL;
 962         }
 963
 964       if (vect_print_dump_info (REPORT_DETAILS))
 965         fprintf (vect_dump, "Considering outer-loop vectorization.");
 966     }
 967
 968   if (!single_exit (loop)
 969       || EDGE_COUNT (loop->header->preds) != 2)
 970     {
 971       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 972         {
 973           if (!single_exit (loop))
 974             fprintf (vect_dump, "not vectorized: multiple exits.");
 975           else if (EDGE_COUNT (loop->header->preds) != 2)
 976             fprintf (vect_dump, "not vectorized: too many incoming edges.");
 977         }
 978       if (inner_loop_vinfo)
 979         destroy_loop_vec_info (inner_loop_vinfo, true);
 980       return NULL;
 981     }
 982
 983   /* We assume that the loop exit condition is at the end of the loop. i.e,
 984      that the loop is represented as a do-while (with a proper if-guard
 985      before the loop if needed), where the loop header contains all the
 986      executable statements, and the latch is empty.  */
 987   if (!empty_block_p (loop->latch)
 988         || phi_nodes (loop->latch))
 989     {
 990       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 991         fprintf (vect_dump, "not vectorized: unexpected loop form.");
 992       if (inner_loop_vinfo)
 993         destroy_loop_vec_info (inner_loop_vinfo, true);
 994       return NULL;
 995     }
 996
 997   /* Make sure there exists a single-predecessor exit bb:  */
 998   if (!single_pred_p (single_exit (loop)->dest))
 999     {
1000       edge e = single_exit (loop);
1001       if (!(e->flags & EDGE_ABNORMAL))
1002         {
1003           split_loop_exit_edge (e);
1004           if (vect_print_dump_info (REPORT_DETAILS))
1005             fprintf (vect_dump, "split exit edge.");
1006         }
1007       else
1008         {
1009           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1010             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1011           if (inner_loop_vinfo)
1012             destroy_loop_vec_info (inner_loop_vinfo, true);
1013           return NULL;
1014         }
1015     }
1016
1017   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1018   if (!loop_cond)
1019     {
1020       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1021         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1022       if (inner_loop_vinfo)
1023         destroy_loop_vec_info (inner_loop_vinfo, true);
1024       return NULL;
1025     }
1026
1027   if (!number_of_iterations)
1028     {
1029       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1030         fprintf (vect_dump,
1031                  "not vectorized: number of iterations cannot be computed.");
1032       if (inner_loop_vinfo)
1033         destroy_loop_vec_info (inner_loop_vinfo, true);
1034       return NULL;
1035     }
1036
1037   if (chrec_contains_undetermined (number_of_iterations))
1038     {
1039       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1040         fprintf (vect_dump, "Infinite number of iterations.");
1041       if (inner_loop_vinfo)
1042         destroy_loop_vec_info (inner_loop_vinfo, true);
1043       return NULL;
1044     }
1045
1046   if (!NITERS_KNOWN_P (number_of_iterations))
1047     {
1048       if (vect_print_dump_info (REPORT_DETAILS))
1049         {
1050           fprintf (vect_dump, "Symbolic number of iterations is ");
1051           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1052         }
1053     }
1054   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1055     {
1056       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1057         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1058       if (inner_loop_vinfo)
1059         destroy_loop_vec_info (inner_loop_vinfo, false);
1060       return NULL;
1061     }
1062
1063   loop_vinfo = new_loop_vec_info (loop);
1064   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1065   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1066
1067   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1068
1069   /* CHECKME: May want to keep it around it in the future.  */
1070   if (inner_loop_vinfo)
1071     destroy_loop_vec_info (inner_loop_vinfo, false);
1072
1073   gcc_assert (!loop->aux);
1074   loop->aux = loop_vinfo;
1075   return loop_vinfo;
1076 }
1077
1078
1079 /* Function vect_analyze_loop_operations.
1080
1081    Scan the loop stmts and make sure they are all vectorizable.  */
1082
1083 static bool
1084 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1085 {
1086   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1087   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1088   int nbbs = loop->num_nodes;
1089   gimple_stmt_iterator si;
1090   unsigned int vectorization_factor = 0;
1091   int i;
1092   gimple phi;
1093   stmt_vec_info stmt_info;
1094   bool need_to_vectorize = false;
1095   int min_profitable_iters;
1096   int min_scalar_loop_bound;
1097   unsigned int th;
1098   bool only_slp_in_loop = true, ok;
1099
1100   if (vect_print_dump_info (REPORT_DETAILS))
1101     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1102
1103   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1104   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1105
1106   for (i = 0; i < nbbs; i++)
1107     {
1108       basic_block bb = bbs[i];
1109
1110       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1111         {
1112           phi = gsi_stmt (si);
1113           ok = true;
1114
1115           stmt_info = vinfo_for_stmt (phi);
1116           if (vect_print_dump_info (REPORT_DETAILS))
1117             {
1118               fprintf (vect_dump, "examining phi: ");
1119               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1120             }
1121
1122           if (! is_loop_header_bb_p (bb))
1123             {
1124               /* inner-loop loop-closed exit phi in outer-loop vectorization
1125                  (i.e. a phi in the tail of the outer-loop).
1126                  FORNOW: we currently don't support the case that these phis
1127                  are not used in the outerloop (unless it is double reduction,
1128                  i.e., this phi is vect_reduction_def), cause this case
1129                  requires to actually do something here.  */
1130               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1131                    || STMT_VINFO_LIVE_P (stmt_info))
1132                   && STMT_VINFO_DEF_TYPE (stmt_info)
1133                      != vect_double_reduction_def)
1134                 {
1135                   if (vect_print_dump_info (REPORT_DETAILS))
1136                     fprintf (vect_dump,
1137                              "Unsupported loop-closed phi in outer-loop.");
1138                   return false;
1139                 }
1140               continue;
1141             }
1142
1143           gcc_assert (stmt_info);
1144
1145           if (STMT_VINFO_LIVE_P (stmt_info))
1146             {
1147               /* FORNOW: not yet supported.  */
1148               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1149                 fprintf (vect_dump, "not vectorized: value used after loop.");
1150               return false;
1151             }
1152
1153           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1154               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1155             {
1156               /* A scalar-dependence cycle that we don't support.  */
1157               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1158                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1159               return false;
1160             }
1161
1162           if (STMT_VINFO_RELEVANT_P (stmt_info))
1163             {
1164               need_to_vectorize = true;
1165               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1166                 ok = vectorizable_induction (phi, NULL, NULL);
1167             }
1168
1169           if (!ok)
1170             {
1171               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1172                 {
1173                   fprintf (vect_dump,
1174                            "not vectorized: relevant phi not supported: ");
1175                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1176                 }
1177               return false;
1178             }
1179         }
1180
1181       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1182         {
1183           gimple stmt = gsi_stmt (si);
1184           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1185
1186           gcc_assert (stmt_info);
1187
1188           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1189             return false;
1190
1191           if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info))
1192             /* STMT needs both SLP and loop-based vectorization.  */
1193             only_slp_in_loop = false;
1194         }
1195     } /* bbs */
1196
1197   /* All operations in the loop are either irrelevant (deal with loop
1198      control, or dead), or only used outside the loop and can be moved
1199      out of the loop (e.g. invariants, inductions).  The loop can be
1200      optimized away by scalar optimizations.  We're better off not
1201      touching this loop.  */
1202   if (!need_to_vectorize)
1203     {
1204       if (vect_print_dump_info (REPORT_DETAILS))
1205         fprintf (vect_dump,
1206                  "All the computation can be taken out of the loop.");
1207       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1208         fprintf (vect_dump,
1209                  "not vectorized: redundant loop. no profit to vectorize.");
1210       return false;
1211     }
1212
1213   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1214      vectorization factor of the loop is the unrolling factor required by the
1215      SLP instances.  If that unrolling factor is 1, we say, that we perform
1216      pure SLP on loop - cross iteration parallelism is not exploited.  */
1217   if (only_slp_in_loop)
1218     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1219   else
1220     vectorization_factor = least_common_multiple (vectorization_factor,
1221                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1222
1223   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1224
1225   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1226       && vect_print_dump_info (REPORT_DETAILS))
1227     fprintf (vect_dump,
1228         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1229         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1230
1231   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1232       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1233     {
1234       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1235         fprintf (vect_dump, "not vectorized: iteration count too small.");
1236       if (vect_print_dump_info (REPORT_DETAILS))
1237         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1238                  "vectorization factor.");
1239       return false;
1240     }
1241
1242   /* Analyze cost. Decide if worth while to vectorize.  */
1243
1244   /* Once VF is set, SLP costs should be updated since the number of created
1245      vector stmts depends on VF.  */
1246   vect_update_slp_costs_according_to_vf (loop_vinfo);
1247
1248   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1249   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1250
1251   if (min_profitable_iters < 0)
1252     {
1253       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1254         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1255       if (vect_print_dump_info (REPORT_DETAILS))
1256         fprintf (vect_dump, "not vectorized: vector version will never be "
1257                  "profitable.");
1258       return false;
1259     }
1260
1261   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1262                             * vectorization_factor) - 1);
1263
1264   /* Use the cost model only if it is more conservative than user specified
1265      threshold.  */
1266
1267   th = (unsigned) min_scalar_loop_bound;
1268   if (min_profitable_iters
1269       && (!min_scalar_loop_bound
1270           || min_profitable_iters > min_scalar_loop_bound))
1271     th = (unsigned) min_profitable_iters;
1272
1273   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1274       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1275     {
1276       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1277         fprintf (vect_dump, "not vectorized: vectorization not "
1278                  "profitable.");
1279       if (vect_print_dump_info (REPORT_DETAILS))
1280         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1281                  "user specified loop bound parameter or minimum "
1282                  "profitable iterations (whichever is more conservative).");
1283       return false;
1284     }
1285
1286   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1287       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1288       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1289     {
1290       if (vect_print_dump_info (REPORT_DETAILS))
1291         fprintf (vect_dump, "epilog loop required.");
1292       if (!vect_can_advance_ivs_p (loop_vinfo))
1293         {
1294           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1295             fprintf (vect_dump,
1296                      "not vectorized: can't create epilog loop 1.");
1297           return false;
1298         }
1299       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1300         {
1301           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1302             fprintf (vect_dump,
1303                      "not vectorized: can't create epilog loop 2.");
1304           return false;
1305         }
1306     }
1307
1308   return true;
1309 }
1310
1311
1312 /* Function vect_analyze_loop.
1313
1314    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1315    for it. The different analyses will record information in the
1316    loop_vec_info struct.  */
1317 loop_vec_info
1318 vect_analyze_loop (struct loop *loop)
1319 {
1320   bool ok;
1321   loop_vec_info loop_vinfo;
1322
1323   if (vect_print_dump_info (REPORT_DETAILS))
1324     fprintf (vect_dump, "===== analyze_loop_nest =====");
1325
1326   if (loop_outer (loop)
1327       && loop_vec_info_for_loop (loop_outer (loop))
1328       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1329     {
1330       if (vect_print_dump_info (REPORT_DETAILS))
1331         fprintf (vect_dump, "outer-loop already vectorized.");
1332       return NULL;
1333     }
1334
1335   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1336
1337   loop_vinfo = vect_analyze_loop_form (loop);
1338   if (!loop_vinfo)
1339     {
1340       if (vect_print_dump_info (REPORT_DETAILS))
1341         fprintf (vect_dump, "bad loop form.");
1342       return NULL;
1343     }
1344
1345   /* Find all data references in the loop (which correspond to vdefs/vuses)
1346      and analyze their evolution in the loop.
1347
1348      FORNOW: Handle only simple, array references, which
1349      alignment can be forced, and aligned pointer-references.  */
1350
1351   ok = vect_analyze_data_refs (loop_vinfo, NULL);
1352   if (!ok)
1353     {
1354       if (vect_print_dump_info (REPORT_DETAILS))
1355         fprintf (vect_dump, "bad data references.");
1356       destroy_loop_vec_info (loop_vinfo, true);
1357       return NULL;
1358     }
1359
1360   /* Classify all cross-iteration scalar data-flow cycles.
1361      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1362
1363   vect_analyze_scalar_cycles (loop_vinfo);
1364
1365   vect_pattern_recog (loop_vinfo);
1366
1367   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1368
1369   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1370   if (!ok)
1371     {
1372       if (vect_print_dump_info (REPORT_DETAILS))
1373         fprintf (vect_dump, "unexpected pattern.");
1374       destroy_loop_vec_info (loop_vinfo, true);
1375       return NULL;
1376     }
1377
1378   /* Analyze the alignment of the data-refs in the loop.
1379      Fail if a data reference is found that cannot be vectorized.  */
1380
1381   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1382   if (!ok)
1383     {
1384       if (vect_print_dump_info (REPORT_DETAILS))
1385         fprintf (vect_dump, "bad data alignment.");
1386       destroy_loop_vec_info (loop_vinfo, true);
1387       return NULL;
1388     }
1389
1390   ok = vect_determine_vectorization_factor (loop_vinfo);
1391   if (!ok)
1392     {
1393       if (vect_print_dump_info (REPORT_DETAILS))
1394         fprintf (vect_dump, "can't determine vectorization factor.");
1395       destroy_loop_vec_info (loop_vinfo, true);
1396       return NULL;
1397     }
1398
1399   /* Analyze data dependences between the data-refs in the loop.
1400      FORNOW: fail at the first data dependence that we encounter.  */
1401
1402   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
1403   if (!ok)
1404     {
1405       if (vect_print_dump_info (REPORT_DETAILS))
1406         fprintf (vect_dump, "bad data dependence.");
1407       destroy_loop_vec_info (loop_vinfo, true);
1408       return NULL;
1409     }
1410
1411   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1412      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1413
1414   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1415   if (!ok)
1416     {
1417       if (vect_print_dump_info (REPORT_DETAILS))
1418         fprintf (vect_dump, "bad data access.");
1419       destroy_loop_vec_info (loop_vinfo, true);
1420       return NULL;
1421     }
1422
1423   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1424      It is important to call pruning after vect_analyze_data_ref_accesses,
1425      since we use grouping information gathered by interleaving analysis.  */
1426   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1427   if (!ok)
1428     {
1429       if (vect_print_dump_info (REPORT_DETAILS))
1430         fprintf (vect_dump, "too long list of versioning for alias "
1431                             "run-time tests.");
1432       destroy_loop_vec_info (loop_vinfo, true);
1433       return NULL;
1434     }
1435
1436   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1437   ok = vect_analyze_slp (loop_vinfo, NULL);
1438   if (ok)
1439     {
1440       /* Decide which possible SLP instances to SLP.  */
1441       vect_make_slp_decision (loop_vinfo);
1442
1443       /* Find stmts that need to be both vectorized and SLPed.  */
1444       vect_detect_hybrid_slp (loop_vinfo);
1445     }
1446
1447   /* This pass will decide on using loop versioning and/or loop peeling in
1448      order to enhance the alignment of data references in the loop.  */
1449
1450   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1451   if (!ok)
1452     {
1453       if (vect_print_dump_info (REPORT_DETAILS))
1454         fprintf (vect_dump, "bad data alignment.");
1455       destroy_loop_vec_info (loop_vinfo, true);
1456       return NULL;
1457     }
1458
1459   /* Scan all the operations in the loop and make sure they are
1460      vectorizable.  */
1461
1462   ok = vect_analyze_loop_operations (loop_vinfo);
1463   if (!ok)
1464     {
1465       if (vect_print_dump_info (REPORT_DETAILS))
1466         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1467       destroy_loop_vec_info (loop_vinfo, true);
1468       return NULL;
1469     }
1470
1471   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1472
1473   return loop_vinfo;
1474 }
1475
1476
1477 /* Function reduction_code_for_scalar_code
1478
1479    Input:
1480    CODE - tree_code of a reduction operations.
1481
1482    Output:
1483    REDUC_CODE - the corresponding tree-code to be used to reduce the
1484       vector of partial results into a single scalar result (which
1485       will also reside in a vector) or ERROR_MARK if the operation is
1486       a supported reduction operation, but does not have such tree-code.
1487
1488    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1489
1490 static bool
1491 reduction_code_for_scalar_code (enum tree_code code,
1492                                 enum tree_code *reduc_code)
1493 {
1494   switch (code)
1495     {
1496       case MAX_EXPR:
1497         *reduc_code = REDUC_MAX_EXPR;
1498         return true;
1499
1500       case MIN_EXPR:
1501         *reduc_code = REDUC_MIN_EXPR;
1502         return true;
1503
1504       case PLUS_EXPR:
1505         *reduc_code = REDUC_PLUS_EXPR;
1506         return true;
1507
1508       case MULT_EXPR:
1509       case MINUS_EXPR:
1510       case BIT_IOR_EXPR:
1511       case BIT_XOR_EXPR:
1512       case BIT_AND_EXPR:
1513         *reduc_code = ERROR_MARK;
1514         return true;
1515
1516       default:
1517        return false;
1518     }
1519 }
1520
1521
1522 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1523    STMT is printed with a message MSG. */
1524
1525 static void
1526 report_vect_op (gimple stmt, const char *msg)
1527 {
1528   fprintf (vect_dump, "%s", msg);
1529   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1530 }
1531
1532
1533 /* Function vect_is_simple_reduction
1534
1535    (1) Detect a cross-iteration def-use cycle that represents a simple
1536    reduction computation. We look for the following pattern:
1537
1538    loop_header:
1539      a1 = phi < a0, a2 >
1540      a3 = ...
1541      a2 = operation (a3, a1)
1542
1543    such that:
1544    1. operation is commutative and associative and it is safe to
1545       change the order of the computation (if CHECK_REDUCTION is true)
1546    2. no uses for a2 in the loop (a2 is used out of the loop)
1547    3. no uses of a1 in the loop besides the reduction operation.
1548
1549    Condition 1 is tested here.
1550    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1551
1552    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1553    nested cycles, if CHECK_REDUCTION is false.
1554
1555    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1556    reductions:
1557
1558      a1 = phi < a0, a2 >
1559      inner loop (def of a3)
1560      a2 = phi < a3 >
1561 */
1562
1563 gimple
1564 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1565                           bool check_reduction, bool *double_reduc)
1566 {
1567   struct loop *loop = (gimple_bb (phi))->loop_father;
1568   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1569   edge latch_e = loop_latch_edge (loop);
1570   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1571   gimple def_stmt, def1 = NULL, def2 = NULL;
1572   enum tree_code code;
1573   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1574   tree type;
1575   int nloop_uses;
1576   tree name;
1577   imm_use_iterator imm_iter;
1578   use_operand_p use_p;
1579   bool phi_def;
1580
1581   *double_reduc = false;
1582
1583   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1584      otherwise, we assume outer loop vectorization.  */
1585   gcc_assert ((check_reduction && loop == vect_loop)
1586               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1587
1588   name = PHI_RESULT (phi);
1589   nloop_uses = 0;
1590   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1591     {
1592       gimple use_stmt = USE_STMT (use_p);
1593       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1594           && vinfo_for_stmt (use_stmt)
1595           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1596         nloop_uses++;
1597       if (nloop_uses > 1)
1598         {
1599           if (vect_print_dump_info (REPORT_DETAILS))
1600             fprintf (vect_dump, "reduction used in loop.");
1601           return NULL;
1602         }
1603     }
1604
1605   if (TREE_CODE (loop_arg) != SSA_NAME)
1606     {
1607       if (vect_print_dump_info (REPORT_DETAILS))
1608         {
1609           fprintf (vect_dump, "reduction: not ssa_name: ");
1610           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1611         }
1612       return NULL;
1613     }
1614
1615   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1616   if (!def_stmt)
1617     {
1618       if (vect_print_dump_info (REPORT_DETAILS))
1619         fprintf (vect_dump, "reduction: no def_stmt.");
1620       return NULL;
1621     }
1622
1623   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1624     {
1625       if (vect_print_dump_info (REPORT_DETAILS))
1626         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1627       return NULL;
1628     }
1629
1630   if (is_gimple_assign (def_stmt))
1631     {
1632       name = gimple_assign_lhs (def_stmt);
1633       phi_def = false;
1634     }
1635   else
1636     {
1637       name = PHI_RESULT (def_stmt);
1638       phi_def = true;
1639     }
1640
1641   nloop_uses = 0;
1642   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1643     {
1644       gimple use_stmt = USE_STMT (use_p);
1645       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1646           && vinfo_for_stmt (use_stmt)
1647           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1648         nloop_uses++;
1649       if (nloop_uses > 1)
1650         {
1651           if (vect_print_dump_info (REPORT_DETAILS))
1652             fprintf (vect_dump, "reduction used in loop.");
1653           return NULL;
1654         }
1655     }
1656
1657   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1658      defined in the inner loop.  */
1659   if (phi_def)
1660     {
1661       op1 = PHI_ARG_DEF (def_stmt, 0);
1662
1663       if (gimple_phi_num_args (def_stmt) != 1
1664           || TREE_CODE (op1) != SSA_NAME)
1665         {
1666           if (vect_print_dump_info (REPORT_DETAILS))
1667             fprintf (vect_dump, "unsupported phi node definition.");
1668
1669           return NULL;
1670         }
1671
1672       def1 = SSA_NAME_DEF_STMT (op1);
1673       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1674           && loop->inner
1675           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1676           && is_gimple_assign (def1))
1677         {
1678           if (vect_print_dump_info (REPORT_DETAILS))
1679             report_vect_op (def_stmt, "detected double reduction: ");
1680
1681           *double_reduc = true;
1682           return def_stmt;
1683         }
1684
1685       return NULL;
1686     }
1687
1688   code = gimple_assign_rhs_code (def_stmt);
1689
1690   if (check_reduction
1691       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1692     {
1693       if (vect_print_dump_info (REPORT_DETAILS))
1694         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1695       return NULL;
1696     }
1697
1698   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1699     {
1700       if (code != COND_EXPR)
1701         {
1702           if (vect_print_dump_info (REPORT_DETAILS))
1703             report_vect_op (def_stmt, "reduction: not binary operation: ");
1704
1705           return NULL;
1706         }
1707
1708       op3 = TREE_OPERAND (TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0), 0);
1709       op4 = TREE_OPERAND (TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0), 1);
1710       op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1711       op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1712
1713       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1714         {
1715           if (vect_print_dump_info (REPORT_DETAILS))
1716             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1717
1718           return NULL;
1719         }
1720     }
1721   else
1722     {
1723       op1 = gimple_assign_rhs1 (def_stmt);
1724       op2 = gimple_assign_rhs2 (def_stmt);
1725
1726       if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1727         {
1728           if (vect_print_dump_info (REPORT_DETAILS))
1729             report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1730
1731           return NULL;
1732         }
1733    }
1734
1735   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1736   if ((TREE_CODE (op1) == SSA_NAME
1737        && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1)))
1738       || (TREE_CODE (op2) == SSA_NAME
1739           && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
1740       || (op3 && TREE_CODE (op3) == SSA_NAME
1741           && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op3)))
1742       || (op4 && TREE_CODE (op4) == SSA_NAME
1743           && TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op4))))
1744     {
1745       if (vect_print_dump_info (REPORT_DETAILS))
1746         {
1747           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1748           print_generic_expr (vect_dump, type, TDF_SLIM);
1749           fprintf (vect_dump, ", operands types: ");
1750           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1751           fprintf (vect_dump, ",");
1752           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1753           if (op3 && op4)
1754             {
1755               fprintf (vect_dump, ",");
1756               print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1757               fprintf (vect_dump, ",");
1758               print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1759             }
1760         }
1761
1762       return NULL;
1763     }
1764
1765   /* Check that it's ok to change the order of the computation.
1766      Generally, when vectorizing a reduction we change the order of the
1767      computation.  This may change the behavior of the program in some
1768      cases, so we need to check that this is ok.  One exception is when
1769      vectorizing an outer-loop: the inner-loop is executed sequentially,
1770      and therefore vectorizing reductions in the inner-loop during
1771      outer-loop vectorization is safe.  */
1772
1773   /* CHECKME: check for !flag_finite_math_only too?  */
1774   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1775       && check_reduction)
1776     {
1777       /* Changing the order of operations changes the semantics.  */
1778       if (vect_print_dump_info (REPORT_DETAILS))
1779         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1780       return NULL;
1781     }
1782   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1783            && check_reduction)
1784     {
1785       /* Changing the order of operations changes the semantics.  */
1786       if (vect_print_dump_info (REPORT_DETAILS))
1787         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1788       return NULL;
1789     }
1790   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1791     {
1792       /* Changing the order of operations changes the semantics.  */
1793       if (vect_print_dump_info (REPORT_DETAILS))
1794         report_vect_op (def_stmt,
1795                         "reduction: unsafe fixed-point math optimization: ");
1796       return NULL;
1797     }
1798
1799   /* Reduction is safe. We're dealing with one of the following:
1800      1) integer arithmetic and no trapv
1801      2) floating point arithmetic, and special flags permit this optimization
1802      3) nested cycle (i.e., outer loop vectorization).  */
1803   if (TREE_CODE (op1) == SSA_NAME)
1804     def1 = SSA_NAME_DEF_STMT (op1);
1805
1806   if (TREE_CODE (op2) == SSA_NAME)
1807     def2 = SSA_NAME_DEF_STMT (op2);
1808
1809   if (code != COND_EXPR
1810       && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1811     {
1812       if (vect_print_dump_info (REPORT_DETAILS))
1813         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1814       return NULL;
1815     }
1816
1817   /* Check that one def is the reduction def, defined by PHI,
1818      the other def is either defined in the loop ("vect_internal_def"),
1819      or it's an induction (defined by a loop-header phi-node).  */
1820
1821   if (def2 && def2 == phi
1822       && (code == COND_EXPR
1823           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1824               && (is_gimple_assign (def1)
1825                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1826                       == vect_induction_def
1827                   || (gimple_code (def1) == GIMPLE_PHI
1828                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1829                           == vect_internal_def
1830                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
1831     {
1832       if (vect_print_dump_info (REPORT_DETAILS))
1833         report_vect_op (def_stmt, "detected reduction: ");
1834       return def_stmt;
1835     }
1836   else if (def1 && def1 == phi
1837            && (code == COND_EXPR
1838                || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1839                    && (is_gimple_assign (def2)
1840                        || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1841                            == vect_induction_def
1842                        || (gimple_code (def2) == GIMPLE_PHI
1843                            && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1844                                == vect_internal_def
1845                            && !is_loop_header_bb_p (gimple_bb (def2)))))))
1846     {
1847       if (check_reduction)
1848         {
1849           /* Swap operands (just for simplicity - so that the rest of the code
1850              can assume that the reduction variable is always the last (second)
1851              argument).  */
1852           if (vect_print_dump_info (REPORT_DETAILS))
1853             report_vect_op (def_stmt,
1854                             "detected reduction: need to swap operands: ");
1855
1856           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1857                               gimple_assign_rhs2_ptr (def_stmt));
1858         }
1859       else
1860         {
1861           if (vect_print_dump_info (REPORT_DETAILS))
1862             report_vect_op (def_stmt, "detected reduction: ");
1863         }
1864
1865       return def_stmt;
1866     }
1867   else
1868     {
1869       if (vect_print_dump_info (REPORT_DETAILS))
1870         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1871
1872       return NULL;
1873     }
1874 }
1875
1876
1877 /* Function vect_estimate_min_profitable_iters
1878
1879    Return the number of iterations required for the vector version of the
1880    loop to be profitable relative to the cost of the scalar version of the
1881    loop.
1882
1883    TODO: Take profile info into account before making vectorization
1884    decisions, if available.  */
1885
1886 int
1887 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1888 {
1889   int i;
1890   int min_profitable_iters;
1891   int peel_iters_prologue;
1892   int peel_iters_epilogue;
1893   int vec_inside_cost = 0;
1894   int vec_outside_cost = 0;
1895   int scalar_single_iter_cost = 0;
1896   int scalar_outside_cost = 0;
1897   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1898   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1899   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1900   int nbbs = loop->num_nodes;
1901   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1902   int peel_guard_costs = 0;
1903   int innerloop_iters = 0, factor;
1904   VEC (slp_instance, heap) *slp_instances;
1905   slp_instance instance;
1906
1907   /* Cost model disabled.  */
1908   if (!flag_vect_cost_model)
1909     {
1910       if (vect_print_dump_info (REPORT_COST))
1911         fprintf (vect_dump, "cost model disabled.");
1912       return 0;
1913     }
1914
1915   /* Requires loop versioning tests to handle misalignment.  */
1916   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1917     {
1918       /*  FIXME: Make cost depend on complexity of individual check.  */
1919       vec_outside_cost +=
1920         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1921       if (vect_print_dump_info (REPORT_COST))
1922         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1923                  "versioning to treat misalignment.\n");
1924     }
1925
1926   /* Requires loop versioning with alias checks.  */
1927   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1928     {
1929       /*  FIXME: Make cost depend on complexity of individual check.  */
1930       vec_outside_cost +=
1931         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1932       if (vect_print_dump_info (REPORT_COST))
1933         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1934                  "versioning aliasing.\n");
1935     }
1936
1937   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1938       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1939     vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1940
1941   /* Count statements in scalar loop.  Using this as scalar cost for a single
1942      iteration for now.
1943
1944      TODO: Add outer loop support.
1945
1946      TODO: Consider assigning different costs to different scalar
1947      statements.  */
1948
1949   /* FORNOW.  */
1950   if (loop->inner)
1951     innerloop_iters = 50; /* FIXME */
1952
1953   for (i = 0; i < nbbs; i++)
1954     {
1955       gimple_stmt_iterator si;
1956       basic_block bb = bbs[i];
1957
1958       if (bb->loop_father == loop->inner)
1959         factor = innerloop_iters;
1960       else
1961         factor = 1;
1962
1963       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1964         {
1965           gimple stmt = gsi_stmt (si);
1966           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1967           /* Skip stmts that are not vectorized inside the loop.  */
1968           if (!STMT_VINFO_RELEVANT_P (stmt_info)
1969               && (!STMT_VINFO_LIVE_P (stmt_info)
1970                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1971             continue;
1972           scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1973           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1974           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1975              some of the "outside" costs are generated inside the outer-loop.  */
1976           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1977         }
1978     }
1979
1980   /* Add additional cost for the peeled instructions in prologue and epilogue
1981      loop.
1982
1983      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1984      at compile-time - we assume it's vf/2 (the worst would be vf-1).
1985
1986      TODO: Build an expression that represents peel_iters for prologue and
1987      epilogue to be used in a run-time test.  */
1988
1989   if (byte_misalign < 0)
1990     {
1991       peel_iters_prologue = vf/2;
1992       if (vect_print_dump_info (REPORT_COST))
1993         fprintf (vect_dump, "cost model: "
1994                  "prologue peel iters set to vf/2.");
1995
1996       /* If peeling for alignment is unknown, loop bound of main loop becomes
1997          unknown.  */
1998       peel_iters_epilogue = vf/2;
1999       if (vect_print_dump_info (REPORT_COST))
2000         fprintf (vect_dump, "cost model: "
2001                  "epilogue peel iters set to vf/2 because "
2002                  "peeling for alignment is unknown .");
2003
2004       /* If peeled iterations are unknown, count a taken branch and a not taken
2005          branch per peeled loop. Even if scalar loop iterations are known,
2006          vector iterations are not known since peeled prologue iterations are
2007          not known. Hence guards remain the same.  */
2008       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
2009                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
2010     }
2011   else
2012     {
2013       if (byte_misalign)
2014         {
2015           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2016           int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2017           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2018           int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2019
2020           peel_iters_prologue = nelements - (byte_misalign / element_size);
2021         }
2022       else
2023         peel_iters_prologue = 0;
2024
2025       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2026         {
2027           peel_iters_epilogue = vf/2;
2028           if (vect_print_dump_info (REPORT_COST))
2029             fprintf (vect_dump, "cost model: "
2030                      "epilogue peel iters set to vf/2 because "
2031                      "loop iterations are unknown .");
2032
2033           /* If peeled iterations are known but number of scalar loop
2034              iterations are unknown, count a taken branch per peeled loop.  */
2035           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
2036
2037         }
2038       else
2039         {
2040           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2041           peel_iters_prologue = niters < peel_iters_prologue ?
2042                                         niters : peel_iters_prologue;
2043           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2044         }
2045     }
2046
2047   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2048                       + (peel_iters_epilogue * scalar_single_iter_cost)
2049                       + peel_guard_costs;
2050
2051   /* FORNOW: The scalar outside cost is incremented in one of the
2052      following ways:
2053
2054      1. The vectorizer checks for alignment and aliasing and generates
2055      a condition that allows dynamic vectorization.  A cost model
2056      check is ANDED with the versioning condition.  Hence scalar code
2057      path now has the added cost of the versioning check.
2058
2059        if (cost > th & versioning_check)
2060          jmp to vector code
2061
2062      Hence run-time scalar is incremented by not-taken branch cost.
2063
2064      2. The vectorizer then checks if a prologue is required.  If the
2065      cost model check was not done before during versioning, it has to
2066      be done before the prologue check.
2067
2068        if (cost <= th)
2069          prologue = scalar_iters
2070        if (prologue == 0)
2071          jmp to vector code
2072        else
2073          execute prologue
2074        if (prologue == num_iters)
2075          go to exit
2076
2077      Hence the run-time scalar cost is incremented by a taken branch,
2078      plus a not-taken branch, plus a taken branch cost.
2079
2080      3. The vectorizer then checks if an epilogue is required.  If the
2081      cost model check was not done before during prologue check, it
2082      has to be done with the epilogue check.
2083
2084        if (prologue == 0)
2085          jmp to vector code
2086        else
2087          execute prologue
2088        if (prologue == num_iters)
2089          go to exit
2090        vector code:
2091          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2092            jmp to epilogue
2093
2094      Hence the run-time scalar cost should be incremented by 2 taken
2095      branches.
2096
2097      TODO: The back end may reorder the BBS's differently and reverse
2098      conditions/branch directions.  Change the estimates below to
2099      something more reasonable.  */
2100
2101   /* If the number of iterations is known and we do not do versioning, we can
2102      decide whether to vectorize at compile time. Hence the scalar version
2103      do not carry cost model guard costs.  */
2104   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2105       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2106       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2107     {
2108       /* Cost model check occurs at versioning.  */
2109       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2110           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2111         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2112       else
2113         {
2114           /* Cost model check occurs at prologue generation.  */
2115           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2116             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2117               + TARG_COND_NOT_TAKEN_BRANCH_COST;
2118           /* Cost model check occurs at epilogue generation.  */
2119           else
2120             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2121         }
2122     }
2123
2124   /* Add SLP costs.  */
2125   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2126   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2127     {
2128       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2129       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2130     }
2131
2132   /* Calculate number of iterations required to make the vector version
2133      profitable, relative to the loop bodies only. The following condition
2134      must hold true:
2135      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2136      where
2137      SIC = scalar iteration cost, VIC = vector iteration cost,
2138      VOC = vector outside cost, VF = vectorization factor,
2139      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2140      SOC = scalar outside cost for run time cost model check.  */
2141
2142   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2143     {
2144       if (vec_outside_cost <= 0)
2145         min_profitable_iters = 1;
2146       else
2147         {
2148           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2149                                   - vec_inside_cost * peel_iters_prologue
2150                                   - vec_inside_cost * peel_iters_epilogue)
2151                                  / ((scalar_single_iter_cost * vf)
2152                                     - vec_inside_cost);
2153
2154           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2155               <= ((vec_inside_cost * min_profitable_iters)
2156                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2157             min_profitable_iters++;
2158         }
2159     }
2160   /* vector version will never be profitable.  */
2161   else
2162     {
2163       if (vect_print_dump_info (REPORT_COST))
2164         fprintf (vect_dump, "cost model: vector iteration cost = %d "
2165                  "is divisible by scalar iteration cost = %d by a factor "
2166                  "greater than or equal to the vectorization factor = %d .",
2167                  vec_inside_cost, scalar_single_iter_cost, vf);
2168       return -1;
2169     }
2170
2171   if (vect_print_dump_info (REPORT_COST))
2172     {
2173       fprintf (vect_dump, "Cost model analysis: \n");
2174       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2175                vec_inside_cost);
2176       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2177                vec_outside_cost);
2178       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2179                scalar_single_iter_cost);
2180       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2181       fprintf (vect_dump, "  prologue iterations: %d\n",
2182                peel_iters_prologue);
2183       fprintf (vect_dump, "  epilogue iterations: %d\n",
2184                peel_iters_epilogue);
2185       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2186                min_profitable_iters);
2187     }
2188
2189   min_profitable_iters =
2190         min_profitable_iters < vf ? vf : min_profitable_iters;
2191
2192   /* Because the condition we create is:
2193      if (niters <= min_profitable_iters)
2194        then skip the vectorized loop.  */
2195   min_profitable_iters--;
2196
2197   if (vect_print_dump_info (REPORT_COST))
2198     fprintf (vect_dump, "  Profitability threshold = %d\n",
2199              min_profitable_iters);
2200
2201   return min_profitable_iters;
2202 }
2203
2204
2205 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2206    functions. Design better to avoid maintenance issues.  */
2207
2208 /* Function vect_model_reduction_cost.
2209
2210    Models cost for a reduction operation, including the vector ops
2211    generated within the strip-mine loop, the initial definition before
2212    the loop, and the epilogue code that must be generated.  */
2213
2214 static bool
2215 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2216                            int ncopies)
2217 {
2218   int outer_cost = 0;
2219   enum tree_code code;
2220   optab optab;
2221   tree vectype;
2222   gimple stmt, orig_stmt;
2223   tree reduction_op;
2224   enum machine_mode mode;
2225   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2226   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2227
2228
2229   /* Cost of reduction op inside loop.  */
2230   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2231
2232   stmt = STMT_VINFO_STMT (stmt_info);
2233
2234   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2235     {
2236     case GIMPLE_SINGLE_RHS:
2237       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2238       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2239       break;
2240     case GIMPLE_UNARY_RHS:
2241       reduction_op = gimple_assign_rhs1 (stmt);
2242       break;
2243     case GIMPLE_BINARY_RHS:
2244       reduction_op = gimple_assign_rhs2 (stmt);
2245       break;
2246     default:
2247       gcc_unreachable ();
2248     }
2249
2250   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2251   if (!vectype)
2252     {
2253       if (vect_print_dump_info (REPORT_COST))
2254         {
2255           fprintf (vect_dump, "unsupported data-type ");
2256           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2257         }
2258       return false;
2259    }
2260
2261   mode = TYPE_MODE (vectype);
2262   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2263
2264   if (!orig_stmt)
2265     orig_stmt = STMT_VINFO_STMT (stmt_info);
2266
2267   code = gimple_assign_rhs_code (orig_stmt);
2268
2269   /* Add in cost for initial definition.  */
2270   outer_cost += TARG_SCALAR_TO_VEC_COST;
2271
2272   /* Determine cost of epilogue code.
2273
2274      We have a reduction operator that will reduce the vector in one statement.
2275      Also requires scalar extract.  */
2276
2277   if (!nested_in_vect_loop_p (loop, orig_stmt))
2278     {
2279       if (reduc_code != ERROR_MARK)
2280         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2281       else
2282         {
2283           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2284           tree bitsize =
2285             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2286           int element_bitsize = tree_low_cst (bitsize, 1);
2287           int nelements = vec_size_in_bits / element_bitsize;
2288
2289           optab = optab_for_tree_code (code, vectype, optab_default);
2290
2291           /* We have a whole vector shift available.  */
2292           if (VECTOR_MODE_P (mode)
2293               && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2294               && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2295             /* Final reduction via vector shifts and the reduction operator. Also
2296                requires scalar extract.  */
2297             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2298                                 + TARG_VEC_TO_SCALAR_COST);
2299           else
2300             /* Use extracts and reduction op for final reduction.  For N elements,
2301                we have N extracts and N-1 reduction ops.  */
2302             outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2303         }
2304     }
2305
2306   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2307
2308   if (vect_print_dump_info (REPORT_COST))
2309     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2310              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2311              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2312
2313   return true;
2314 }
2315
2316
2317 /* Function vect_model_induction_cost.
2318
2319    Models cost for induction operations.  */
2320
2321 static void
2322 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2323 {
2324   /* loop cost for vec_loop.  */
2325   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2326   /* prologue cost for vec_init and vec_step.  */
2327   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2328
2329   if (vect_print_dump_info (REPORT_COST))
2330     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2331              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2332              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2333 }
2334
2335
2336 /* Function get_initial_def_for_induction
2337
2338    Input:
2339    STMT - a stmt that performs an induction operation in the loop.
2340    IV_PHI - the initial value of the induction variable
2341
2342    Output:
2343    Return a vector variable, initialized with the first VF values of
2344    the induction variable. E.g., for an iv with IV_PHI='X' and
2345    evolution S, for a vector of 4 units, we want to return:
2346    [X, X + S, X + 2*S, X + 3*S].  */
2347
2348 static tree
2349 get_initial_def_for_induction (gimple iv_phi)
2350 {
2351   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2352   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2353   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2354   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2355   tree vectype;
2356   int nunits;
2357   edge pe = loop_preheader_edge (loop);
2358   struct loop *iv_loop;
2359   basic_block new_bb;
2360   tree vec, vec_init, vec_step, t;
2361   tree access_fn;
2362   tree new_var;
2363   tree new_name;
2364   gimple init_stmt, induction_phi, new_stmt;
2365   tree induc_def, vec_def, vec_dest;
2366   tree init_expr, step_expr;
2367   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2368   int i;
2369   bool ok;
2370   int ncopies;
2371   tree expr;
2372   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2373   bool nested_in_vect_loop = false;
2374   gimple_seq stmts = NULL;
2375   imm_use_iterator imm_iter;
2376   use_operand_p use_p;
2377   gimple exit_phi;
2378   edge latch_e;
2379   tree loop_arg;
2380   gimple_stmt_iterator si;
2381   basic_block bb = gimple_bb (iv_phi);
2382   tree stepvectype;
2383
2384   vectype = get_vectype_for_scalar_type (scalar_type);
2385   gcc_assert (vectype);
2386   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2387   ncopies = vf / nunits;
2388
2389   gcc_assert (phi_info);
2390   gcc_assert (ncopies >= 1);
2391
2392   /* Find the first insertion point in the BB.  */
2393   si = gsi_after_labels (bb);
2394
2395   if (INTEGRAL_TYPE_P (scalar_type))
2396     step_expr = build_int_cst (scalar_type, 0);
2397   else if (POINTER_TYPE_P (scalar_type))
2398     step_expr = build_int_cst (sizetype, 0);
2399   else
2400     step_expr = build_real (scalar_type, dconst0);
2401
2402   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2403   if (nested_in_vect_loop_p (loop, iv_phi))
2404     {
2405       nested_in_vect_loop = true;
2406       iv_loop = loop->inner;
2407     }
2408   else
2409     iv_loop = loop;
2410   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2411
2412   latch_e = loop_latch_edge (iv_loop);
2413   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2414
2415   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2416   gcc_assert (access_fn);
2417   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2418                                     &init_expr, &step_expr);
2419   gcc_assert (ok);
2420   pe = loop_preheader_edge (iv_loop);
2421
2422   /* Create the vector that holds the initial_value of the induction.  */
2423   if (nested_in_vect_loop)
2424     {
2425       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2426          been created during vectorization of previous stmts; We obtain it from
2427          the STMT_VINFO_VEC_STMT of the defining stmt. */
2428       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2429                                            loop_preheader_edge (iv_loop));
2430       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2431     }
2432   else
2433     {
2434       /* iv_loop is the loop to be vectorized. Create:
2435          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2436       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2437       add_referenced_var (new_var);
2438
2439       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2440       if (stmts)
2441         {
2442           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2443           gcc_assert (!new_bb);
2444         }
2445
2446       t = NULL_TREE;
2447       t = tree_cons (NULL_TREE, init_expr, t);
2448       for (i = 1; i < nunits; i++)
2449         {
2450           /* Create: new_name_i = new_name + step_expr  */
2451           enum tree_code code = POINTER_TYPE_P (scalar_type)
2452                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2453           init_stmt = gimple_build_assign_with_ops (code, new_var,
2454                                                     new_name, step_expr);
2455           new_name = make_ssa_name (new_var, init_stmt);
2456           gimple_assign_set_lhs (init_stmt, new_name);
2457
2458           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2459           gcc_assert (!new_bb);
2460
2461           if (vect_print_dump_info (REPORT_DETAILS))
2462             {
2463               fprintf (vect_dump, "created new init_stmt: ");
2464               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2465             }
2466           t = tree_cons (NULL_TREE, new_name, t);
2467         }
2468       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2469       vec = build_constructor_from_list (vectype, nreverse (t));
2470       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2471     }
2472
2473
2474   /* Create the vector that holds the step of the induction.  */
2475   if (nested_in_vect_loop)
2476     /* iv_loop is nested in the loop to be vectorized. Generate:
2477        vec_step = [S, S, S, S]  */
2478     new_name = step_expr;
2479   else
2480     {
2481       /* iv_loop is the loop to be vectorized. Generate:
2482           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2483       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2484       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2485                               expr, step_expr);
2486     }
2487
2488   t = NULL_TREE;
2489   for (i = 0; i < nunits; i++)
2490     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2491   gcc_assert (CONSTANT_CLASS_P (new_name));
2492   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2493   gcc_assert (stepvectype);
2494   vec = build_vector (stepvectype, t);
2495   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2496
2497
2498   /* Create the following def-use cycle:
2499      loop prolog:
2500          vec_init = ...
2501          vec_step = ...
2502      loop:
2503          vec_iv = PHI <vec_init, vec_loop>
2504          ...
2505          STMT
2506          ...
2507          vec_loop = vec_iv + vec_step;  */
2508
2509   /* Create the induction-phi that defines the induction-operand.  */
2510   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2511   add_referenced_var (vec_dest);
2512   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2513   set_vinfo_for_stmt (induction_phi,
2514                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2515   induc_def = PHI_RESULT (induction_phi);
2516
2517   /* Create the iv update inside the loop  */
2518   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2519                                            induc_def, vec_step);
2520   vec_def = make_ssa_name (vec_dest, new_stmt);
2521   gimple_assign_set_lhs (new_stmt, vec_def);
2522   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2523   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2524                                                    NULL));
2525
2526   /* Set the arguments of the phi node:  */
2527   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2528   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2529                UNKNOWN_LOCATION);
2530
2531
2532   /* In case that vectorization factor (VF) is bigger than the number
2533      of elements that we can fit in a vectype (nunits), we have to generate
2534      more than one vector stmt - i.e - we need to "unroll" the
2535      vector stmt by a factor VF/nunits.  For more details see documentation
2536      in vectorizable_operation.  */
2537
2538   if (ncopies > 1)
2539     {
2540       stmt_vec_info prev_stmt_vinfo;
2541       /* FORNOW. This restriction should be relaxed.  */
2542       gcc_assert (!nested_in_vect_loop);
2543
2544       /* Create the vector that holds the step of the induction.  */
2545       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2546       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2547                               expr, step_expr);
2548       t = NULL_TREE;
2549       for (i = 0; i < nunits; i++)
2550         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2551       gcc_assert (CONSTANT_CLASS_P (new_name));
2552       vec = build_vector (stepvectype, t);
2553       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2554
2555       vec_def = induc_def;
2556       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2557       for (i = 1; i < ncopies; i++)
2558         {
2559           /* vec_i = vec_prev + vec_step  */
2560           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2561                                                    vec_def, vec_step);
2562           vec_def = make_ssa_name (vec_dest, new_stmt);
2563           gimple_assign_set_lhs (new_stmt, vec_def);
2564
2565           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2566           set_vinfo_for_stmt (new_stmt,
2567                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2568           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2569           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2570         }
2571     }
2572
2573   if (nested_in_vect_loop)
2574     {
2575       /* Find the loop-closed exit-phi of the induction, and record
2576          the final vector of induction results:  */
2577       exit_phi = NULL;
2578       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2579         {
2580           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2581             {
2582               exit_phi = USE_STMT (use_p);
2583               break;
2584             }
2585         }
2586       if (exit_phi)
2587         {
2588           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2589           /* FORNOW. Currently not supporting the case that an inner-loop induction
2590              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2591           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2592                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2593
2594           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2595           if (vect_print_dump_info (REPORT_DETAILS))
2596             {
2597               fprintf (vect_dump, "vector of inductions after inner-loop:");
2598               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2599             }
2600         }
2601     }
2602
2603
2604   if (vect_print_dump_info (REPORT_DETAILS))
2605     {
2606       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2607       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2608       fprintf (vect_dump, "\n");
2609       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2610     }
2611
2612   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2613   return induc_def;
2614 }
2615
2616
2617 /* Function get_initial_def_for_reduction
2618
2619    Input:
2620    STMT - a stmt that performs a reduction operation in the loop.
2621    INIT_VAL - the initial value of the reduction variable
2622
2623    Output:
2624    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2625         of the reduction (used for adjusting the epilog - see below).
2626    Return a vector variable, initialized according to the operation that STMT
2627         performs. This vector will be used as the initial value of the
2628         vector of partial results.
2629
2630    Option1 (adjust in epilog): Initialize the vector as follows:
2631      add/bit or/xor:    [0,0,...,0,0]
2632      mult/bit and:      [1,1,...,1,1]
2633      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2634    and when necessary (e.g. add/mult case) let the caller know
2635    that it needs to adjust the result by init_val.
2636
2637    Option2: Initialize the vector as follows:
2638      add/bit or/xor:    [init_val,0,0,...,0]
2639      mult/bit and:      [init_val,1,1,...,1]
2640      min/max/cond_expr: [init_val,init_val,...,init_val]
2641    and no adjustments are needed.
2642
2643    For example, for the following code:
2644
2645    s = init_val;
2646    for (i=0;i<n;i++)
2647      s = s + a[i];
2648
2649    STMT is 's = s + a[i]', and the reduction variable is 's'.
2650    For a vector of 4 units, we want to return either [0,0,0,init_val],
2651    or [0,0,0,0] and let the caller know that it needs to adjust
2652    the result at the end by 'init_val'.
2653
2654    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2655    initialization vector is simpler (same element in all entries), if
2656    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2657
2658    A cost model should help decide between these two schemes.  */
2659
2660 tree
2661 get_initial_def_for_reduction (gimple stmt, tree init_val,
2662                                tree *adjustment_def)
2663 {
2664   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2665   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2666   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2667   tree scalar_type = TREE_TYPE (init_val);
2668   tree vectype = get_vectype_for_scalar_type (scalar_type);
2669   int nunits;
2670   enum tree_code code = gimple_assign_rhs_code (stmt);
2671   tree def_for_init;
2672   tree init_def;
2673   tree t = NULL_TREE;
2674   int i;
2675   bool nested_in_vect_loop = false;
2676   tree init_value;
2677   REAL_VALUE_TYPE real_init_val = dconst0;
2678   int int_init_val = 0;
2679   gimple def_stmt = NULL;
2680
2681   gcc_assert (vectype);
2682   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2683
2684   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2685               || SCALAR_FLOAT_TYPE_P (scalar_type));
2686
2687   if (nested_in_vect_loop_p (loop, stmt))
2688     nested_in_vect_loop = true;
2689   else
2690     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2691
2692   /* In case of double reduction we only create a vector variable to be put
2693      in the reduction phi node. The actual statement creation is done in
2694      vect_create_epilog_for_reduction.  */
2695   if (adjustment_def && nested_in_vect_loop
2696       && TREE_CODE (init_val) == SSA_NAME
2697       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2698       && gimple_code (def_stmt) == GIMPLE_PHI
2699       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2700       && vinfo_for_stmt (def_stmt)
2701       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2702           == vect_double_reduction_def)
2703     {
2704       *adjustment_def = NULL;
2705       return vect_create_destination_var (init_val, vectype);
2706     }
2707
2708   if (TREE_CONSTANT (init_val))
2709     {
2710       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2711         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2712       else
2713         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2714     }
2715   else
2716     init_value = init_val;
2717
2718   switch (code)
2719     {
2720       case WIDEN_SUM_EXPR:
2721       case DOT_PROD_EXPR:
2722       case PLUS_EXPR:
2723       case MINUS_EXPR:
2724       case BIT_IOR_EXPR:
2725       case BIT_XOR_EXPR:
2726       case MULT_EXPR:
2727       case BIT_AND_EXPR:
2728         /* ADJUSMENT_DEF is NULL when called from
2729            vect_create_epilog_for_reduction to vectorize double reduction.  */
2730         if (adjustment_def)
2731           {
2732             if (nested_in_vect_loop)
2733               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2734                                                               NULL);
2735             else
2736               *adjustment_def = init_val;
2737           }
2738
2739         if (code == MULT_EXPR || code == BIT_AND_EXPR)
2740           {
2741             real_init_val = dconst1;
2742             int_init_val = 1;
2743           }
2744
2745         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2746           def_for_init = build_real (scalar_type, real_init_val);
2747         else
2748           def_for_init = build_int_cst (scalar_type, int_init_val);
2749
2750         /* Create a vector of '0' or '1' except the first element.  */
2751         for (i = nunits - 2; i >= 0; --i)
2752           t = tree_cons (NULL_TREE, def_for_init, t);
2753
2754         /* Option1: the first element is '0' or '1' as well.  */
2755         if (adjustment_def)
2756           {
2757             t = tree_cons (NULL_TREE, def_for_init, t);
2758             init_def = build_vector (vectype, t);
2759             break;
2760           }
2761
2762         /* Option2: the first element is INIT_VAL.  */
2763         t = tree_cons (NULL_TREE, init_value, t);
2764         if (TREE_CONSTANT (init_val))
2765           init_def = build_vector (vectype, t);
2766         else
2767           init_def = build_constructor_from_list (vectype, t);
2768
2769         break;
2770
2771       case MIN_EXPR:
2772       case MAX_EXPR:
2773       case COND_EXPR:
2774         if (adjustment_def)
2775           {
2776             *adjustment_def = NULL_TREE;
2777             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2778             break;
2779           }
2780
2781         for (i = nunits - 1; i >= 0; --i)
2782           t = tree_cons (NULL_TREE, init_value, t);
2783
2784         if (TREE_CONSTANT (init_val))
2785           init_def = build_vector (vectype, t);
2786         else
2787           init_def = build_constructor_from_list (vectype, t);
2788
2789         break;
2790
2791       default:
2792         gcc_unreachable ();
2793     }
2794
2795   return init_def;
2796 }
2797
2798
2799 /* Function vect_create_epilog_for_reduction
2800
2801    Create code at the loop-epilog to finalize the result of a reduction
2802    computation.
2803
2804    VECT_DEF is a vector of partial results.
2805    REDUC_CODE is the tree-code for the epilog reduction.
2806    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2807      number of elements that we can fit in a vectype (nunits). In this case
2808      we have to generate more than one vector stmt - i.e - we need to "unroll"
2809      the vector stmt by a factor VF/nunits.  For more details see documentation
2810      in vectorizable_operation.
2811    STMT is the scalar reduction stmt that is being vectorized.
2812    REDUCTION_PHI is the phi-node that carries the reduction computation.
2813    REDUC_INDEX is the index of the operand in the right hand side of the
2814      statement that is defined by REDUCTION_PHI.
2815    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2816
2817    This function:
2818    1. Creates the reduction def-use cycle: sets the arguments for
2819       REDUCTION_PHI:
2820       The loop-entry argument is the vectorized initial-value of the reduction.
2821       The loop-latch argument is VECT_DEF - the vector of partial sums.
2822    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2823       by applying the operation specified by REDUC_CODE if available, or by
2824       other means (whole-vector shifts or a scalar loop).
2825       The function also creates a new phi node at the loop exit to preserve
2826       loop-closed form, as illustrated below.
2827
2828      The flow at the entry to this function:
2829
2830         loop:
2831           vec_def = phi <null, null>            # REDUCTION_PHI
2832           VECT_DEF = vector_stmt                # vectorized form of STMT
2833           s_loop = scalar_stmt                  # (scalar) STMT
2834         loop_exit:
2835           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2836           use <s_out0>
2837           use <s_out0>
2838
2839      The above is transformed by this function into:
2840
2841         loop:
2842           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2843           VECT_DEF = vector_stmt                # vectorized form of STMT
2844           s_loop = scalar_stmt                  # (scalar) STMT
2845         loop_exit:
2846           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2847           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2848           v_out2 = reduce <v_out1>
2849           s_out3 = extract_field <v_out2, 0>
2850           s_out4 = adjust_result <s_out3>
2851           use <s_out4>
2852           use <s_out4>
2853 */
2854
2855 static void
2856 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2857                                   int ncopies,
2858                                   enum tree_code reduc_code,
2859                                   gimple reduction_phi,
2860                                   int reduc_index,
2861                                   bool double_reduc)
2862 {
2863   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2864   stmt_vec_info prev_phi_info;
2865   tree vectype;
2866   enum machine_mode mode;
2867   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2868   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2869   basic_block exit_bb;
2870   tree scalar_dest;
2871   tree scalar_type;
2872   gimple new_phi = NULL, phi;
2873   gimple_stmt_iterator exit_gsi;
2874   tree vec_dest;
2875   tree new_temp = NULL_TREE;
2876   tree new_name;
2877   gimple epilog_stmt = NULL;
2878   tree new_scalar_dest, new_dest;
2879   gimple exit_phi;
2880   tree bitsize, bitpos, bytesize;
2881   enum tree_code code = gimple_assign_rhs_code (stmt);
2882   tree adjustment_def;
2883   tree vec_initial_def, def;
2884   tree orig_name;
2885   imm_use_iterator imm_iter;
2886   use_operand_p use_p;
2887   bool extract_scalar_result = false;
2888   tree reduction_op, expr;
2889   gimple orig_stmt;
2890   gimple use_stmt;
2891   bool nested_in_vect_loop = false;
2892   VEC(gimple,heap) *phis = NULL;
2893   enum vect_def_type dt = vect_unknown_def_type;
2894   int j, i;
2895
2896   if (nested_in_vect_loop_p (loop, stmt))
2897     {
2898       outer_loop = loop;
2899       loop = loop->inner;
2900       nested_in_vect_loop = true;
2901     }
2902
2903   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2904     {
2905     case GIMPLE_SINGLE_RHS:
2906       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2907                                        == ternary_op);
2908       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2909       break;
2910     case GIMPLE_UNARY_RHS:
2911       reduction_op = gimple_assign_rhs1 (stmt);
2912       break;
2913     case GIMPLE_BINARY_RHS:
2914       reduction_op = reduc_index ?
2915                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2916       break;
2917     default:
2918       gcc_unreachable ();
2919     }
2920
2921   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2922   gcc_assert (vectype);
2923   mode = TYPE_MODE (vectype);
2924
2925   /*** 1. Create the reduction def-use cycle  ***/
2926
2927   /* For the case of reduction, vect_get_vec_def_for_operand returns
2928      the scalar def before the loop, that defines the initial value
2929      of the reduction variable.  */
2930   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2931                                                   &adjustment_def);
2932
2933   phi = reduction_phi;
2934   def = vect_def;
2935   for (j = 0; j < ncopies; j++)
2936     {
2937       /* 1.1 set the loop-entry arg of the reduction-phi:  */
2938       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2939                    UNKNOWN_LOCATION);
2940
2941       /* 1.2 set the loop-latch arg for the reduction-phi:  */
2942       if (j > 0)
2943         def = vect_get_vec_def_for_stmt_copy (dt, def);
2944       add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2945
2946       if (vect_print_dump_info (REPORT_DETAILS))
2947         {
2948           fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2949           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2950           fprintf (vect_dump, "\n");
2951           print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2952         }
2953
2954       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2955     }
2956
2957   /*** 2. Create epilog code
2958           The reduction epilog code operates across the elements of the vector
2959           of partial results computed by the vectorized loop.
2960           The reduction epilog code consists of:
2961           step 1: compute the scalar result in a vector (v_out2)
2962           step 2: extract the scalar result (s_out3) from the vector (v_out2)
2963           step 3: adjust the scalar result (s_out3) if needed.
2964
2965           Step 1 can be accomplished using one the following three schemes:
2966           (scheme 1) using reduc_code, if available.
2967           (scheme 2) using whole-vector shifts, if available.
2968           (scheme 3) using a scalar loop. In this case steps 1+2 above are
2969                      combined.
2970
2971           The overall epilog code looks like this:
2972
2973           s_out0 = phi <s_loop>         # original EXIT_PHI
2974           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
2975           v_out2 = reduce <v_out1>              # step 1
2976           s_out3 = extract_field <v_out2, 0>    # step 2
2977           s_out4 = adjust_result <s_out3>       # step 3
2978
2979           (step 3 is optional, and steps 1 and 2 may be combined).
2980           Lastly, the uses of s_out0 are replaced by s_out4.
2981
2982           ***/
2983
2984   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2985         v_out1 = phi <v_loop>  */
2986
2987   exit_bb = single_exit (loop)->dest;
2988   def = vect_def;
2989   prev_phi_info = NULL;
2990   for (j = 0; j < ncopies; j++)
2991     {
2992       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2993       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
2994       if (j == 0)
2995         new_phi = phi;
2996       else
2997         {
2998           def = vect_get_vec_def_for_stmt_copy (dt, def);
2999           STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3000         }
3001       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3002       prev_phi_info = vinfo_for_stmt (phi);
3003     }
3004
3005   exit_gsi = gsi_after_labels (exit_bb);
3006
3007   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3008          (i.e. when reduc_code is not available) and in the final adjustment
3009          code (if needed).  Also get the original scalar reduction variable as
3010          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3011          represents a reduction pattern), the tree-code and scalar-def are
3012          taken from the original stmt that the pattern-stmt (STMT) replaces.
3013          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3014          are taken from STMT.  */
3015
3016   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3017   if (!orig_stmt)
3018     {
3019       /* Regular reduction  */
3020       orig_stmt = stmt;
3021     }
3022   else
3023     {
3024       /* Reduction pattern  */
3025       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3026       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3027       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3028     }
3029
3030   code = gimple_assign_rhs_code (orig_stmt);
3031   scalar_dest = gimple_assign_lhs (orig_stmt);
3032   scalar_type = TREE_TYPE (scalar_dest);
3033   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3034   bitsize = TYPE_SIZE (scalar_type);
3035   bytesize = TYPE_SIZE_UNIT (scalar_type);
3036
3037   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3038      partial results are added and not subtracted.  */
3039   if (code == MINUS_EXPR)
3040     code = PLUS_EXPR;
3041
3042   /* In case this is a reduction in an inner-loop while vectorizing an outer
3043      loop - we don't need to extract a single scalar result at the end of the
3044      inner-loop (unless it is double reduction, i.e., the use of reduction is
3045      outside the outer-loop). The final vector of partial results will be used
3046      in the vectorized outer-loop, or reduced to a scalar result at the end of
3047      the outer-loop.  */
3048   if (nested_in_vect_loop && !double_reduc)
3049     goto vect_finalize_reduction;
3050
3051   /* The epilogue is created for the outer-loop, i.e., for the loop being
3052      vectorized.  */
3053   if (double_reduc)
3054     loop = outer_loop;
3055
3056   /* FORNOW */
3057   gcc_assert (ncopies == 1);
3058
3059   /* 2.3 Create the reduction code, using one of the three schemes described
3060          above.  */
3061
3062   if (reduc_code != ERROR_MARK)
3063     {
3064       tree tmp;
3065
3066       /*** Case 1:  Create:
3067            v_out2 = reduc_expr <v_out1>  */
3068
3069       if (vect_print_dump_info (REPORT_DETAILS))
3070         fprintf (vect_dump, "Reduce using direct vector reduction.");
3071
3072       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3073       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3074       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3075       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3076       gimple_assign_set_lhs (epilog_stmt, new_temp);
3077       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3078
3079       extract_scalar_result = true;
3080     }
3081   else
3082     {
3083       enum tree_code shift_code = ERROR_MARK;
3084       bool have_whole_vector_shift = true;
3085       int bit_offset;
3086       int element_bitsize = tree_low_cst (bitsize, 1);
3087       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3088       tree vec_temp;
3089
3090       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3091         shift_code = VEC_RSHIFT_EXPR;
3092       else
3093         have_whole_vector_shift = false;
3094
3095       /* Regardless of whether we have a whole vector shift, if we're
3096          emulating the operation via tree-vect-generic, we don't want
3097          to use it.  Only the first round of the reduction is likely
3098          to still be profitable via emulation.  */
3099       /* ??? It might be better to emit a reduction tree code here, so that
3100          tree-vect-generic can expand the first round via bit tricks.  */
3101       if (!VECTOR_MODE_P (mode))
3102         have_whole_vector_shift = false;
3103       else
3104         {
3105           optab optab = optab_for_tree_code (code, vectype, optab_default);
3106           if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3107             have_whole_vector_shift = false;
3108         }
3109
3110       if (have_whole_vector_shift)
3111         {
3112           /*** Case 2: Create:
3113              for (offset = VS/2; offset >= element_size; offset/=2)
3114                 {
3115                   Create:  va' = vec_shift <va, offset>
3116                   Create:  va = vop <va, va'>
3117                 }  */
3118
3119           if (vect_print_dump_info (REPORT_DETAILS))
3120             fprintf (vect_dump, "Reduce using vector shifts");
3121
3122           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3123           new_temp = PHI_RESULT (new_phi);
3124
3125           for (bit_offset = vec_size_in_bits/2;
3126                bit_offset >= element_bitsize;
3127                bit_offset /= 2)
3128             {
3129               tree bitpos = size_int (bit_offset);
3130
3131               epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3132                                                           new_temp, bitpos);
3133               new_name = make_ssa_name (vec_dest, epilog_stmt);
3134               gimple_assign_set_lhs (epilog_stmt, new_name);
3135               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3136
3137               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3138                                                           new_name, new_temp);
3139               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3140               gimple_assign_set_lhs (epilog_stmt, new_temp);
3141               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3142             }
3143
3144           extract_scalar_result = true;
3145         }
3146       else
3147         {
3148           tree rhs;
3149
3150           /*** Case 3: Create:
3151              s = extract_field <v_out2, 0>
3152              for (offset = element_size;
3153                   offset < vector_size;
3154                   offset += element_size;)
3155                {
3156                  Create:  s' = extract_field <v_out2, offset>
3157                  Create:  s = op <s, s'>
3158                }  */
3159
3160           if (vect_print_dump_info (REPORT_DETAILS))
3161             fprintf (vect_dump, "Reduce using scalar code. ");
3162
3163           vec_temp = PHI_RESULT (new_phi);
3164           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3165           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3166                          bitsize_zero_node);
3167           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3168           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3169           gimple_assign_set_lhs (epilog_stmt, new_temp);
3170           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3171
3172           for (bit_offset = element_bitsize;
3173                bit_offset < vec_size_in_bits;
3174                bit_offset += element_bitsize)
3175             {
3176               tree bitpos = bitsize_int (bit_offset);
3177               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3178                                  bitpos);
3179
3180               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3181               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3182               gimple_assign_set_lhs (epilog_stmt, new_name);
3183               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3184
3185               epilog_stmt = gimple_build_assign_with_ops (code,
3186                                                           new_scalar_dest,
3187                                                           new_name, new_temp);
3188               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3189               gimple_assign_set_lhs (epilog_stmt, new_temp);
3190               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3191             }
3192
3193           extract_scalar_result = false;
3194         }
3195     }
3196
3197   /* 2.4  Extract the final scalar result.  Create:
3198          s_out3 = extract_field <v_out2, bitpos>  */
3199
3200   if (extract_scalar_result)
3201     {
3202       tree rhs;
3203
3204       gcc_assert (!nested_in_vect_loop || double_reduc);
3205       if (vect_print_dump_info (REPORT_DETAILS))
3206         fprintf (vect_dump, "extract scalar result");
3207
3208       if (BYTES_BIG_ENDIAN)
3209         bitpos = size_binop (MULT_EXPR,
3210                        bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3211                        TYPE_SIZE (scalar_type));
3212       else
3213         bitpos = bitsize_zero_node;
3214
3215       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3216       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3217       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3218       gimple_assign_set_lhs (epilog_stmt, new_temp);
3219       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3220     }
3221
3222 vect_finalize_reduction:
3223
3224   if (double_reduc)
3225     loop = loop->inner;
3226
3227   /* 2.5 Adjust the final result by the initial value of the reduction
3228          variable. (When such adjustment is not needed, then
3229          'adjustment_def' is zero).  For example, if code is PLUS we create:
3230          new_temp = loop_exit_def + adjustment_def  */
3231
3232   if (adjustment_def)
3233     {
3234       if (nested_in_vect_loop)
3235         {
3236           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3237           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3238           new_dest = vect_create_destination_var (scalar_dest, vectype);
3239         }
3240       else
3241         {
3242           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3243           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3244           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3245         }
3246
3247       epilog_stmt = gimple_build_assign (new_dest, expr);
3248       new_temp = make_ssa_name (new_dest, epilog_stmt);
3249       gimple_assign_set_lhs (epilog_stmt, new_temp);
3250       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3251       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3252     }
3253
3254
3255   /* 2.6  Handle the loop-exit phi  */
3256
3257   /* Replace uses of s_out0 with uses of s_out3:
3258      Find the loop-closed-use at the loop exit of the original scalar result.
3259      (The reduction result is expected to have two immediate uses - one at the
3260      latch block, and one at the loop exit).  */
3261   phis = VEC_alloc (gimple, heap, 10);
3262   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3263     {
3264       if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3265         {
3266           exit_phi = USE_STMT (use_p);
3267           VEC_quick_push (gimple, phis, exit_phi);
3268         }
3269     }
3270
3271   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
3272   gcc_assert (!VEC_empty (gimple, phis));
3273
3274   for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3275     {
3276       if (nested_in_vect_loop)
3277         {
3278           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3279           gimple vect_phi;
3280
3281           /* FORNOW. Currently not supporting the case that an inner-loop
3282              reduction is not used in the outer-loop (but only outside the
3283              outer-loop), unless it is double reduction.  */
3284           gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3285                       && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3286
3287           epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3288           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3289           set_vinfo_for_stmt (epilog_stmt,
3290                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3291                                                  NULL));
3292           if (adjustment_def)
3293             STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3294                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3295
3296           if (!double_reduc
3297               || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3298             continue;
3299
3300           /* Handle double reduction:
3301
3302              stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3303              stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3304              stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
3305              stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3306
3307              At that point the regular reduction (stmt2 and stmt3) is already
3308              vectorized, as well as the exit phi node, stmt4.
3309              Here we vectorize the phi node of double reduction, stmt1, and
3310              update all relevant statements.  */
3311
3312           /* Go through all the uses of s2 to find double reduction phi node,
3313              i.e., stmt1 above.  */
3314           orig_name = PHI_RESULT (exit_phi);
3315           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3316             {
3317               stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3318               stmt_vec_info new_phi_vinfo;
3319               tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3320               basic_block bb = gimple_bb (use_stmt);
3321               gimple use;
3322
3323               /* Check that USE_STMT is really double reduction phi node.  */
3324               if (gimple_code (use_stmt) != GIMPLE_PHI
3325                   || gimple_phi_num_args (use_stmt) != 2
3326                   || !use_stmt_vinfo
3327                   || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3328                       != vect_double_reduction_def
3329                   || bb->loop_father != outer_loop)
3330                 continue;
3331
3332               /* Create vector phi node for double reduction:
3333                  vs1 = phi <vs0, vs2>
3334                  vs1 was created previously in this function by a call to
3335                  vect_get_vec_def_for_operand and is stored in vec_initial_def;
3336                  vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3337                  vs0 is created here.  */
3338
3339               /* Create vector phi node.  */
3340               vect_phi = create_phi_node (vec_initial_def, bb);
3341               new_phi_vinfo = new_stmt_vec_info (vect_phi,
3342                                     loop_vec_info_for_loop (outer_loop), NULL);
3343               set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3344
3345               /* Create vs0 - initial def of the double reduction phi.  */
3346               preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3347                                              loop_preheader_edge (outer_loop));
3348               init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3349                                                         NULL);
3350               vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3351                                                 NULL);
3352
3353               /* Update phi node arguments with vs0 and vs2.  */
3354               add_phi_arg (vect_phi, vect_phi_init,
3355                            loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
3356               add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3357                            loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3358               if (vect_print_dump_info (REPORT_DETAILS))
3359                 {
3360                   fprintf (vect_dump, "created double reduction phi node: ");
3361                   print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3362                 }
3363
3364               vect_phi_res = PHI_RESULT (vect_phi);
3365
3366               /* Replace the use, i.e., set the correct vs1 in the regular
3367                  reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3368                  is redundant.  */
3369               use = reduction_phi;
3370               for (j = 0; j < ncopies; j++)
3371                 {
3372                   edge pr_edge = loop_preheader_edge (loop);
3373                   SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3374                   use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3375                 }
3376             }
3377         }
3378
3379       /* Replace the uses:  */
3380       orig_name = PHI_RESULT (exit_phi);
3381       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3382         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3383           SET_USE (use_p, new_temp);
3384     }
3385
3386   VEC_free (gimple, heap, phis);
3387 }
3388
3389
3390 /* Function vectorizable_reduction.
3391
3392    Check if STMT performs a reduction operation that can be vectorized.
3393    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3394    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3395    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3396
3397    This function also handles reduction idioms (patterns) that have been
3398    recognized in advance during vect_pattern_recog. In this case, STMT may be
3399    of this form:
3400      X = pattern_expr (arg0, arg1, ..., X)
3401    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3402    sequence that had been detected and replaced by the pattern-stmt (STMT).
3403
3404    In some cases of reduction patterns, the type of the reduction variable X is
3405    different than the type of the other arguments of STMT.
3406    In such cases, the vectype that is used when transforming STMT into a vector
3407    stmt is different than the vectype that is used to determine the
3408    vectorization factor, because it consists of a different number of elements
3409    than the actual number of elements that are being operated upon in parallel.
3410
3411    For example, consider an accumulation of shorts into an int accumulator.
3412    On some targets it's possible to vectorize this pattern operating on 8
3413    shorts at a time (hence, the vectype for purposes of determining the
3414    vectorization factor should be V8HI); on the other hand, the vectype that
3415    is used to create the vector form is actually V4SI (the type of the result).
3416
3417    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3418    indicates what is the actual level of parallelism (V8HI in the example), so
3419    that the right vectorization factor would be derived. This vectype
3420    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3421    be used to create the vectorized stmt. The right vectype for the vectorized
3422    stmt is obtained from the type of the result X:
3423         get_vectype_for_scalar_type (TREE_TYPE (X))
3424
3425    This means that, contrary to "regular" reductions (or "regular" stmts in
3426    general), the following equation:
3427       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3428    does *NOT* necessarily hold for reduction patterns.  */
3429
3430 bool
3431 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3432                         gimple *vec_stmt)
3433 {
3434   tree vec_dest;
3435   tree scalar_dest;
3436   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3437   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3438   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3439   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3440   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3441   enum tree_code code, orig_code, epilog_reduc_code;
3442   enum machine_mode vec_mode;
3443   int op_type;
3444   optab optab, reduc_optab;
3445   tree new_temp = NULL_TREE;
3446   tree def;
3447   gimple def_stmt;
3448   enum vect_def_type dt;
3449   gimple new_phi = NULL;
3450   tree scalar_type;
3451   bool is_simple_use;
3452   gimple orig_stmt;
3453   stmt_vec_info orig_stmt_info;
3454   tree expr = NULL_TREE;
3455   int i;
3456   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3457   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3458   int epilog_copies;
3459   stmt_vec_info prev_stmt_info, prev_phi_info;
3460   gimple first_phi = NULL;
3461   bool single_defuse_cycle = false;
3462   tree reduc_def = NULL_TREE;
3463   gimple new_stmt = NULL;
3464   int j;
3465   tree ops[3];
3466   bool nested_cycle = false, found_nested_cycle_def = false;
3467   gimple reduc_def_stmt = NULL;
3468   /* The default is that the reduction variable is the last in statement.  */
3469   int reduc_index = 2;
3470   bool double_reduc = false, dummy;
3471   basic_block def_bb;
3472   struct loop * def_stmt_loop, *outer_loop = NULL;
3473   tree def_arg;
3474   gimple def_arg_stmt;
3475
3476   if (nested_in_vect_loop_p (loop, stmt))
3477     {
3478       outer_loop = loop;
3479       loop = loop->inner;
3480       nested_cycle = true;
3481     }
3482
3483   gcc_assert (ncopies >= 1);
3484
3485   /* FORNOW: SLP not supported.  */
3486   if (STMT_SLP_TYPE (stmt_info))
3487     return false;
3488
3489   /* 1. Is vectorizable reduction?  */
3490   /* Not supportable if the reduction variable is used in the loop.  */
3491   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3492     return false;
3493
3494   /* Reductions that are not used even in an enclosing outer-loop,
3495      are expected to be "live" (used out of the loop).  */
3496   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3497       && !STMT_VINFO_LIVE_P (stmt_info))
3498     return false;
3499
3500   /* Make sure it was already recognized as a reduction computation.  */
3501   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3502       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3503     return false;
3504
3505   /* 2. Has this been recognized as a reduction pattern?
3506
3507      Check if STMT represents a pattern that has been recognized
3508      in earlier analysis stages.  For stmts that represent a pattern,
3509      the STMT_VINFO_RELATED_STMT field records the last stmt in
3510      the original sequence that constitutes the pattern.  */
3511
3512   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3513   if (orig_stmt)
3514     {
3515       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3516       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3517       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3518       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3519     }
3520
3521   /* 3. Check the operands of the operation. The first operands are defined
3522         inside the loop body. The last operand is the reduction variable,
3523         which is defined by the loop-header-phi.  */
3524
3525   gcc_assert (is_gimple_assign (stmt));
3526
3527   /* Flatten RHS */
3528   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3529     {
3530     case GIMPLE_SINGLE_RHS:
3531       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3532       if (op_type == ternary_op)
3533         {
3534           tree rhs = gimple_assign_rhs1 (stmt);
3535           ops[0] = TREE_OPERAND (rhs, 0);
3536           ops[1] = TREE_OPERAND (rhs, 1);
3537           ops[2] = TREE_OPERAND (rhs, 2);
3538           code = TREE_CODE (rhs);
3539         }
3540       else
3541         return false;
3542       break;
3543
3544     case GIMPLE_BINARY_RHS:
3545       code = gimple_assign_rhs_code (stmt);
3546       op_type = TREE_CODE_LENGTH (code);
3547       gcc_assert (op_type == binary_op);
3548       ops[0] = gimple_assign_rhs1 (stmt);
3549       ops[1] = gimple_assign_rhs2 (stmt);
3550       break;
3551
3552     case GIMPLE_UNARY_RHS:
3553       return false;
3554
3555     default:
3556       gcc_unreachable ();
3557     }
3558
3559   scalar_dest = gimple_assign_lhs (stmt);
3560   scalar_type = TREE_TYPE (scalar_dest);
3561   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3562       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3563     return false;
3564
3565   /* All uses but the last are expected to be defined in the loop.
3566      The last use is the reduction variable. In case of nested cycle this
3567      assumption is not true: we use reduc_index to record the index of the
3568      reduction variable.  */
3569   for (i = 0; i < op_type-1; i++)
3570     {
3571       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
3572       if (i == 0 && code == COND_EXPR)
3573         continue;
3574
3575       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3576                                           &def, &dt);
3577       gcc_assert (is_simple_use);
3578       if (dt != vect_internal_def
3579           && dt != vect_external_def
3580           && dt != vect_constant_def
3581           && dt != vect_induction_def
3582           && !(dt == vect_nested_cycle && nested_cycle))
3583         return false;
3584
3585       if (dt == vect_nested_cycle)
3586         {
3587           found_nested_cycle_def = true;
3588           reduc_def_stmt = def_stmt;
3589           reduc_index = i;
3590         }
3591     }
3592
3593   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3594                                       &def, &dt);
3595   gcc_assert (is_simple_use);
3596   gcc_assert (dt == vect_reduction_def
3597               || dt == vect_nested_cycle
3598               || ((dt == vect_internal_def || dt == vect_external_def
3599                    || dt == vect_constant_def || dt == vect_induction_def)
3600                    && nested_cycle && found_nested_cycle_def));
3601   if (!found_nested_cycle_def)
3602     reduc_def_stmt = def_stmt;
3603
3604   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3605   if (orig_stmt)
3606     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3607                                                        reduc_def_stmt,
3608                                                        !nested_cycle,
3609                                                        &dummy));
3610   else
3611     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3612                                                   !nested_cycle, &dummy));
3613
3614   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3615     return false;
3616
3617   vec_mode = TYPE_MODE (vectype);
3618
3619   if (code == COND_EXPR)
3620     {
3621       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3622         {
3623           if (vect_print_dump_info (REPORT_DETAILS))
3624             fprintf (vect_dump, "unsupported condition in reduction");
3625
3626             return false;
3627         }
3628     }
3629   else
3630     {
3631       /* 4. Supportable by target?  */
3632
3633       /* 4.1. check support for the operation in the loop  */
3634       optab = optab_for_tree_code (code, vectype, optab_default);
3635       if (!optab)
3636         {
3637           if (vect_print_dump_info (REPORT_DETAILS))
3638             fprintf (vect_dump, "no optab.");
3639
3640           return false;
3641         }
3642
3643       if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3644         {
3645           if (vect_print_dump_info (REPORT_DETAILS))
3646             fprintf (vect_dump, "op not supported by target.");
3647
3648           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3649               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3650                   < vect_min_worthwhile_factor (code))
3651             return false;
3652
3653           if (vect_print_dump_info (REPORT_DETAILS))
3654             fprintf (vect_dump, "proceeding using word mode.");
3655         }
3656
3657       /* Worthwhile without SIMD support?  */
3658       if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3659           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3660              < vect_min_worthwhile_factor (code))
3661         {
3662           if (vect_print_dump_info (REPORT_DETAILS))
3663             fprintf (vect_dump, "not worthwhile without SIMD support.");
3664
3665           return false;
3666         }
3667     }
3668
3669   /* 4.2. Check support for the epilog operation.
3670
3671           If STMT represents a reduction pattern, then the type of the
3672           reduction variable may be different than the type of the rest
3673           of the arguments.  For example, consider the case of accumulation
3674           of shorts into an int accumulator; The original code:
3675                         S1: int_a = (int) short_a;
3676           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
3677
3678           was replaced with:
3679                         STMT: int_acc = widen_sum <short_a, int_acc>
3680
3681           This means that:
3682           1. The tree-code that is used to create the vector operation in the
3683              epilog code (that reduces the partial results) is not the
3684              tree-code of STMT, but is rather the tree-code of the original
3685              stmt from the pattern that STMT is replacing. I.e, in the example
3686              above we want to use 'widen_sum' in the loop, but 'plus' in the
3687              epilog.
3688           2. The type (mode) we use to check available target support
3689              for the vector operation to be created in the *epilog*, is
3690              determined by the type of the reduction variable (in the example
3691              above we'd check this: plus_optab[vect_int_mode]).
3692              However the type (mode) we use to check available target support
3693              for the vector operation to be created *inside the loop*, is
3694              determined by the type of the other arguments to STMT (in the
3695              example we'd check this: widen_sum_optab[vect_short_mode]).
3696
3697           This is contrary to "regular" reductions, in which the types of all
3698           the arguments are the same as the type of the reduction variable.
3699           For "regular" reductions we can therefore use the same vector type
3700           (and also the same tree-code) when generating the epilog code and
3701           when generating the code inside the loop.  */
3702
3703   if (orig_stmt)
3704     {
3705       /* This is a reduction pattern: get the vectype from the type of the
3706          reduction variable, and get the tree-code from orig_stmt.  */
3707       orig_code = gimple_assign_rhs_code (orig_stmt);
3708       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3709       if (!vectype)
3710         {
3711           if (vect_print_dump_info (REPORT_DETAILS))
3712             {
3713               fprintf (vect_dump, "unsupported data-type ");
3714               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3715             }
3716           return false;
3717         }
3718
3719       vec_mode = TYPE_MODE (vectype);
3720     }
3721   else
3722     {
3723       /* Regular reduction: use the same vectype and tree-code as used for
3724          the vector code inside the loop can be used for the epilog code. */
3725       orig_code = code;
3726     }
3727
3728   if (nested_cycle)
3729     {
3730       def_bb = gimple_bb (reduc_def_stmt);
3731       def_stmt_loop = def_bb->loop_father;
3732       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3733                                        loop_preheader_edge (def_stmt_loop));
3734       if (TREE_CODE (def_arg) == SSA_NAME
3735           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3736           && gimple_code (def_arg_stmt) == GIMPLE_PHI
3737           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3738           && vinfo_for_stmt (def_arg_stmt)
3739           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3740               == vect_double_reduction_def)
3741         double_reduc = true;
3742     }
3743
3744   epilog_reduc_code = ERROR_MARK;
3745   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3746     {
3747       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3748                                          optab_default);
3749       if (!reduc_optab)
3750         {
3751           if (vect_print_dump_info (REPORT_DETAILS))
3752             fprintf (vect_dump, "no optab for reduction.");
3753
3754           epilog_reduc_code = ERROR_MARK;
3755         }
3756
3757       if (reduc_optab
3758           && optab_handler (reduc_optab, vec_mode)->insn_code
3759               == CODE_FOR_nothing)
3760         {
3761           if (vect_print_dump_info (REPORT_DETAILS))
3762             fprintf (vect_dump, "reduc op not supported by target.");
3763
3764           epilog_reduc_code = ERROR_MARK;
3765         }
3766     }
3767   else
3768     {
3769       if (!nested_cycle || double_reduc)
3770         {
3771           if (vect_print_dump_info (REPORT_DETAILS))
3772             fprintf (vect_dump, "no reduc code for scalar code.");
3773
3774           return false;
3775         }
3776     }
3777
3778   if (double_reduc && ncopies > 1)
3779     {
3780       if (vect_print_dump_info (REPORT_DETAILS))
3781         fprintf (vect_dump, "multiple types in double reduction");
3782
3783       return false;
3784     }
3785
3786   if (!vec_stmt) /* transformation not required.  */
3787     {
3788       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3789       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3790         return false;
3791       return true;
3792     }
3793
3794   /** Transform.  **/
3795
3796   if (vect_print_dump_info (REPORT_DETAILS))
3797     fprintf (vect_dump, "transform reduction.");
3798
3799   /* FORNOW: Multiple types are not supported for condition.  */
3800   if (code == COND_EXPR)
3801     gcc_assert (ncopies == 1);
3802
3803   /* Create the destination vector  */
3804   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3805
3806   /* In case the vectorization factor (VF) is bigger than the number
3807      of elements that we can fit in a vectype (nunits), we have to generate
3808      more than one vector stmt - i.e - we need to "unroll" the
3809      vector stmt by a factor VF/nunits.  For more details see documentation
3810      in vectorizable_operation.  */
3811
3812   /* If the reduction is used in an outer loop we need to generate
3813      VF intermediate results, like so (e.g. for ncopies=2):
3814         r0 = phi (init, r0)
3815         r1 = phi (init, r1)
3816         r0 = x0 + r0;
3817         r1 = x1 + r1;
3818     (i.e. we generate VF results in 2 registers).
3819     In this case we have a separate def-use cycle for each copy, and therefore
3820     for each copy we get the vector def for the reduction variable from the
3821     respective phi node created for this copy.
3822
3823     Otherwise (the reduction is unused in the loop nest), we can combine
3824     together intermediate results, like so (e.g. for ncopies=2):
3825         r = phi (init, r)
3826         r = x0 + r;
3827         r = x1 + r;
3828    (i.e. we generate VF/2 results in a single register).
3829    In this case for each copy we get the vector def for the reduction variable
3830    from the vectorized reduction operation generated in the previous iteration.
3831   */
3832
3833   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3834     {
3835       single_defuse_cycle = true;
3836       epilog_copies = 1;
3837     }
3838   else
3839     epilog_copies = ncopies;
3840
3841   prev_stmt_info = NULL;
3842   prev_phi_info = NULL;
3843   for (j = 0; j < ncopies; j++)
3844     {
3845       if (j == 0 || !single_defuse_cycle)
3846         {
3847           /* Create the reduction-phi that defines the reduction-operand.  */
3848           new_phi = create_phi_node (vec_dest, loop->header);
3849           set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3850                                                           NULL));
3851           /* Get the vector def for the reduction variable from the phi
3852              node.  */
3853           reduc_def = PHI_RESULT (new_phi);
3854         }
3855
3856       if (code == COND_EXPR)
3857         {
3858           first_phi = new_phi;
3859           vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
3860           /* Multiple types are not supported for condition.  */
3861           break;
3862         }
3863
3864       /* Handle uses.  */
3865       if (j == 0)
3866         {
3867           loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3868                                                         stmt, NULL);
3869           if (op_type == ternary_op)
3870             {
3871               if (reduc_index == 0)
3872                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3873                                                               NULL);
3874               else
3875                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3876                                                               NULL);
3877             }
3878
3879           /* Get the vector def for the reduction variable from the phi
3880              node.  */
3881           first_phi = new_phi;
3882         }
3883       else
3884         {
3885           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3886           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3887           if (op_type == ternary_op)
3888             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3889
3890           if (single_defuse_cycle)
3891             reduc_def = gimple_assign_lhs (new_stmt);
3892           else
3893             reduc_def = PHI_RESULT (new_phi);
3894
3895           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3896         }
3897
3898       /* Arguments are ready. Create the new vector stmt.  */
3899       if (op_type == binary_op)
3900         {
3901           if (reduc_index == 0)
3902             expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3903           else
3904             expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3905         }
3906       else
3907         {
3908           if (reduc_index == 0)
3909             expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3910                            loop_vec_def1);
3911           else
3912             {
3913               if (reduc_index == 1)
3914                 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3915                                loop_vec_def1);
3916               else
3917                 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3918                                reduc_def);
3919             }
3920         }
3921
3922       new_stmt = gimple_build_assign (vec_dest, expr);
3923       new_temp = make_ssa_name (vec_dest, new_stmt);
3924       gimple_assign_set_lhs (new_stmt, new_temp);
3925       vect_finish_stmt_generation (stmt, new_stmt, gsi);
3926
3927       if (j == 0)
3928         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3929       else
3930         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3931
3932       prev_stmt_info = vinfo_for_stmt (new_stmt);
3933       prev_phi_info = vinfo_for_stmt (new_phi);
3934     }
3935
3936   /* Finalize the reduction-phi (set its arguments) and create the
3937      epilog reduction code.  */
3938   if (!single_defuse_cycle || code == COND_EXPR)
3939     new_temp = gimple_assign_lhs (*vec_stmt);
3940
3941   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3942                                     epilog_reduc_code, first_phi, reduc_index,
3943                                     double_reduc);
3944   return true;
3945 }
3946
3947 /* Function vect_min_worthwhile_factor.
3948
3949    For a loop where we could vectorize the operation indicated by CODE,
3950    return the minimum vectorization factor that makes it worthwhile
3951    to use generic vectors.  */
3952 int
3953 vect_min_worthwhile_factor (enum tree_code code)
3954 {
3955   switch (code)
3956     {
3957     case PLUS_EXPR:
3958     case MINUS_EXPR:
3959     case NEGATE_EXPR:
3960       return 4;
3961
3962     case BIT_AND_EXPR:
3963     case BIT_IOR_EXPR:
3964     case BIT_XOR_EXPR:
3965     case BIT_NOT_EXPR:
3966       return 2;
3967
3968     default:
3969       return INT_MAX;
3970     }
3971 }
3972
3973
3974 /* Function vectorizable_induction
3975
3976    Check if PHI performs an induction computation that can be vectorized.
3977    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3978    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3979    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3980
3981 bool
3982 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3983                         gimple *vec_stmt)
3984 {
3985   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3986   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3987   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3988   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3989   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3990   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3991   tree vec_def;
3992
3993   gcc_assert (ncopies >= 1);
3994   /* FORNOW. This restriction should be relaxed.  */
3995   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3996     {
3997       if (vect_print_dump_info (REPORT_DETAILS))
3998         fprintf (vect_dump, "multiple types in nested loop.");
3999       return false;
4000     }
4001
4002   if (!STMT_VINFO_RELEVANT_P (stmt_info))
4003     return false;
4004
4005   /* FORNOW: SLP not supported.  */
4006   if (STMT_SLP_TYPE (stmt_info))
4007     return false;
4008
4009   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4010
4011   if (gimple_code (phi) != GIMPLE_PHI)
4012     return false;
4013
4014   if (!vec_stmt) /* transformation not required.  */
4015     {
4016       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4017       if (vect_print_dump_info (REPORT_DETAILS))
4018         fprintf (vect_dump, "=== vectorizable_induction ===");
4019       vect_model_induction_cost (stmt_info, ncopies);
4020       return true;
4021     }
4022
4023   /** Transform.  **/
4024
4025   if (vect_print_dump_info (REPORT_DETAILS))
4026     fprintf (vect_dump, "transform induction phi.");
4027
4028   vec_def = get_initial_def_for_induction (phi);
4029   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4030   return true;
4031 }
4032
4033 /* Function vectorizable_live_operation.
4034
4035    STMT computes a value that is used outside the loop. Check if
4036    it can be supported.  */
4037
4038 bool
4039 vectorizable_live_operation (gimple stmt,
4040                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4041                              gimple *vec_stmt ATTRIBUTE_UNUSED)
4042 {
4043   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4044   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4045   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4046   int i;
4047   int op_type;
4048   tree op;
4049   tree def;
4050   gimple def_stmt;
4051   enum vect_def_type dt;
4052   enum tree_code code;
4053   enum gimple_rhs_class rhs_class;
4054
4055   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4056
4057   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4058     return false;
4059
4060   if (!is_gimple_assign (stmt))
4061     return false;
4062
4063   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4064     return false;
4065
4066   /* FORNOW. CHECKME. */
4067   if (nested_in_vect_loop_p (loop, stmt))
4068     return false;
4069
4070   code = gimple_assign_rhs_code (stmt);
4071   op_type = TREE_CODE_LENGTH (code);
4072   rhs_class = get_gimple_rhs_class (code);
4073   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4074   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4075
4076   /* FORNOW: support only if all uses are invariant. This means
4077      that the scalar operations can remain in place, unvectorized.
4078      The original last scalar value that they compute will be used.  */
4079
4080   for (i = 0; i < op_type; i++)
4081     {
4082       if (rhs_class == GIMPLE_SINGLE_RHS)
4083         op = TREE_OPERAND (gimple_op (stmt, 1), i);
4084       else
4085         op = gimple_op (stmt, i + 1);
4086       if (op
4087           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4088         {
4089           if (vect_print_dump_info (REPORT_DETAILS))
4090             fprintf (vect_dump, "use not simple.");
4091           return false;
4092         }
4093
4094       if (dt != vect_external_def && dt != vect_constant_def)
4095         return false;
4096     }
4097
4098   /* No transformation is required for the cases we currently support.  */
4099   return true;
4100 }
4101
4102 /* Function vect_transform_loop.
4103
4104    The analysis phase has determined that the loop is vectorizable.
4105    Vectorize the loop - created vectorized stmts to replace the scalar
4106    stmts in the loop, and update the loop exit condition.  */
4107
4108 void
4109 vect_transform_loop (loop_vec_info loop_vinfo)
4110 {
4111   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4112   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4113   int nbbs = loop->num_nodes;
4114   gimple_stmt_iterator si;
4115   int i;
4116   tree ratio = NULL;
4117   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4118   bool strided_store;
4119   bool slp_scheduled = false;
4120   unsigned int nunits;
4121   tree cond_expr = NULL_TREE;
4122   gimple_seq cond_expr_stmt_list = NULL;
4123   bool do_peeling_for_loop_bound;
4124
4125   if (vect_print_dump_info (REPORT_DETAILS))
4126     fprintf (vect_dump, "=== vec_transform_loop ===");
4127
4128   /* Peel the loop if there are data refs with unknown alignment.
4129      Only one data ref with unknown store is allowed.  */
4130
4131   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4132     vect_do_peeling_for_alignment (loop_vinfo);
4133
4134   do_peeling_for_loop_bound
4135     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4136        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4137            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4138
4139   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4140       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4141     vect_loop_versioning (loop_vinfo,
4142                           !do_peeling_for_loop_bound,
4143                           &cond_expr, &cond_expr_stmt_list);
4144
4145   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4146      compile time constant), or it is a constant that doesn't divide by the
4147      vectorization factor, then an epilog loop needs to be created.
4148      We therefore duplicate the loop: the original loop will be vectorized,
4149      and will compute the first (n/VF) iterations. The second copy of the loop
4150      will remain scalar and will compute the remaining (n%VF) iterations.
4151      (VF is the vectorization factor).  */
4152
4153   if (do_peeling_for_loop_bound)
4154     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4155                                     cond_expr, cond_expr_stmt_list);
4156   else
4157     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4158                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4159
4160   /* 1) Make sure the loop header has exactly two entries
4161      2) Make sure we have a preheader basic block.  */
4162
4163   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4164
4165   split_edge (loop_preheader_edge (loop));
4166
4167   /* FORNOW: the vectorizer supports only loops which body consist
4168      of one basic block (header + empty latch). When the vectorizer will
4169      support more involved loop forms, the order by which the BBs are
4170      traversed need to be reconsidered.  */
4171
4172   for (i = 0; i < nbbs; i++)
4173     {
4174       basic_block bb = bbs[i];
4175       stmt_vec_info stmt_info;
4176       gimple phi;
4177
4178       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4179         {
4180           phi = gsi_stmt (si);
4181           if (vect_print_dump_info (REPORT_DETAILS))
4182             {
4183               fprintf (vect_dump, "------>vectorizing phi: ");
4184               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4185             }
4186           stmt_info = vinfo_for_stmt (phi);
4187           if (!stmt_info)
4188             continue;
4189
4190           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4191               && !STMT_VINFO_LIVE_P (stmt_info))
4192             continue;
4193
4194           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4195                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4196               && vect_print_dump_info (REPORT_DETAILS))
4197             fprintf (vect_dump, "multiple-types.");
4198
4199           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4200             {
4201               if (vect_print_dump_info (REPORT_DETAILS))
4202                 fprintf (vect_dump, "transform phi.");
4203               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4204             }
4205         }
4206
4207       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4208         {
4209           gimple stmt = gsi_stmt (si);
4210           bool is_store;
4211
4212           if (vect_print_dump_info (REPORT_DETAILS))
4213             {
4214               fprintf (vect_dump, "------>vectorizing statement: ");
4215               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4216             }
4217
4218           stmt_info = vinfo_for_stmt (stmt);
4219
4220           /* vector stmts created in the outer-loop during vectorization of
4221              stmts in an inner-loop may not have a stmt_info, and do not
4222              need to be vectorized.  */
4223           if (!stmt_info)
4224             {
4225               gsi_next (&si);
4226               continue;
4227             }
4228
4229           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4230               && !STMT_VINFO_LIVE_P (stmt_info))
4231             {
4232               gsi_next (&si);
4233               continue;
4234             }
4235
4236           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4237           nunits =
4238             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4239           if (!STMT_SLP_TYPE (stmt_info)
4240               && nunits != (unsigned int) vectorization_factor
4241               && vect_print_dump_info (REPORT_DETAILS))
4242             /* For SLP VF is set according to unrolling factor, and not to
4243                vector size, hence for SLP this print is not valid.  */
4244             fprintf (vect_dump, "multiple-types.");
4245
4246           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4247              reached.  */
4248           if (STMT_SLP_TYPE (stmt_info))
4249             {
4250               if (!slp_scheduled)
4251                 {
4252                   slp_scheduled = true;
4253
4254                   if (vect_print_dump_info (REPORT_DETAILS))
4255                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4256
4257                   vect_schedule_slp (loop_vinfo, NULL);
4258                 }
4259
4260               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4261               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4262                 {
4263                   gsi_next (&si);
4264                   continue;
4265                 }
4266             }
4267
4268           /* -------- vectorize statement ------------ */
4269           if (vect_print_dump_info (REPORT_DETAILS))
4270             fprintf (vect_dump, "transform statement.");
4271
4272           strided_store = false;
4273           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4274           if (is_store)
4275             {
4276               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4277                 {
4278                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4279                      interleaving chain was completed - free all the stores in
4280                      the chain.  */
4281                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4282                   gsi_remove (&si, true);
4283                   continue;
4284                 }
4285               else
4286                 {
4287                   /* Free the attached stmt_vec_info and remove the stmt.  */
4288                   free_stmt_vec_info (stmt);
4289                   gsi_remove (&si, true);
4290                   continue;
4291                 }
4292             }
4293           gsi_next (&si);
4294         }                       /* stmts in BB */
4295     }                           /* BBs in loop */
4296
4297   slpeel_make_loop_iterate_ntimes (loop, ratio);
4298
4299   /* The memory tags and pointers in vectorized statements need to
4300      have their SSA forms updated.  FIXME, why can't this be delayed
4301      until all the loops have been transformed?  */
4302   update_ssa (TODO_update_ssa);
4303
4304   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4305     fprintf (vect_dump, "LOOP VECTORIZED.");
4306   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4307     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4308 }