gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "stor-layout.h"
  30 #include "basic-block.h"
  31 #include "gimple-pretty-print.h"
  32 #include "gimple.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "gimple-ssa.h"
  37 #include "tree-phinodes.h"
  38 #include "ssa-iterators.h"
  39 #include "stringpool.h"
  40 #include "tree-ssanames.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-pass.h"
  45 #include "cfgloop.h"
  46 #include "expr.h"
  47 #include "recog.h"
  48 #include "optabs.h"
  49 #include "params.h"
  50 #include "diagnostic-core.h"
  51 #include "tree-chrec.h"
  52 #include "tree-scalar-evolution.h"
  53 #include "tree-vectorizer.h"
  54 #include "target.h"
  55
  56 /* Loop Vectorization Pass.
  57
  58    This pass tries to vectorize loops.
  59
  60    For example, the vectorizer transforms the following simple loop:
  61
  62         short a[N]; short b[N]; short c[N]; int i;
  63
  64         for (i=0; i<N; i++){
  65           a[i] = b[i] + c[i];
  66         }
  67
  68    as if it was manually vectorized by rewriting the source code into:
  69
  70         typedef int __attribute__((mode(V8HI))) v8hi;
  71         short a[N];  short b[N]; short c[N];   int i;
  72         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  73         v8hi va, vb, vc;
  74
  75         for (i=0; i<N/8; i++){
  76           vb = pb[i];
  77           vc = pc[i];
  78           va = vb + vc;
  79           pa[i] = va;
  80         }
  81
  82         The main entry to this pass is vectorize_loops(), in which
  83    the vectorizer applies a set of analyses on a given set of loops,
  84    followed by the actual vectorization transformation for the loops that
  85    had successfully passed the analysis phase.
  86         Throughout this pass we make a distinction between two types of
  87    data: scalars (which are represented by SSA_NAMES), and memory references
  88    ("data-refs").  These two types of data require different handling both
   89    during analysis and transformation.  The types of data-refs that the
   90    vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   91    (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   92    accesses are required to have a simple (consecutive) access pattern.
  93
  94    Analysis phase:
  95    ===============
  96         The driver for the analysis phase is vect_analyze_loop().
  97    It applies a set of analyses, some of which rely on the scalar evolution
  98    analyzer (scev) developed by Sebastian Pop.
  99
 100         During the analysis phase the vectorizer records some information
 101    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 102    loop, as well as general information about the loop as a whole, which is
 103    recorded in a "loop_vec_info" struct attached to each loop.
 104
 105    Transformation phase:
 106    =====================
 107         The loop transformation phase scans all the stmts in the loop, and
 108    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 109    the loop that needs to be vectorized.  It inserts the vector code sequence
 110    just before the scalar stmt S, and records a pointer to the vector code
 111    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 112    attached to S).  This pointer will be used for the vectorization of following
 113    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 114    otherwise, we rely on dead code elimination for removing it.
 115
 116         For example, say stmt S1 was vectorized into stmt VS1:
 117
 118    VS1: vb = px[i];
 119    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 120    S2:  a = b;
 121
 122    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 123    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 124    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 125    resulting sequence would be:
 126
 127    VS1: vb = px[i];
 128    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 129    VS2: va = vb;
 130    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 131
  132         Operands that are not SSA_NAMEs are data-refs that appear in
  133    load/store operations (like 'x[i]' in S1), and are handled differently.
 134
 135    Target modeling:
 136    =================
 137         Currently the only target specific information that is used is the
 138    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 139    Targets that can support different sizes of vectors, for now will need
 140    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 141    flexibility will be added in the future.
 142
  143         Since we only vectorize operations whose vector form can be
  144    expressed using existing tree codes, to verify that an operation is
  145    supported, the vectorizer checks the relevant optab at the relevant
  146    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 147    the value found is CODE_FOR_nothing, then there's no target support, and
 148    we can't vectorize the stmt.
 149
 150    For additional information on this project see:
 151    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 152 */
 153
  154 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
      /* The line above is a prototype only: it declares a file-local (static)
         function taking a loop_vec_info and two int pointers.  Its body is
         defined elsewhere.  */
 155
  156 /* Function vect_determine_vectorization_factor
  157
  158    Determine the vectorization factor (VF).  VF is the number of data elements
  159    that are operated upon in parallel in a single iteration of the vectorized
  160    loop.  For example, when vectorizing a loop that operates on 4byte elements,
  161    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
  162    elements can fit in a single vector register.
  163
  164    VF is the largest number of vector elements needed by any examined phi or
  165    stmt.  Return false if a stmt has no lhs or has a vector mode, a scalar type
  166    has no vector type, a stmt mixes vector types of different sizes, or VF
  167    ends up <= 1; otherwise record VF in LOOP_VINFO and return true.
  168
  169    VF is also the factor by which the loop iterations are strip-mined, e.g.:
  170    original loop:
  171         for (i=0; i<N; i++){
  172           a[i] = b[i] + c[i];
  173         }
  174
  175    vectorized loop:
  176         for (i=0; i<N; i+=VF){
  177           a[i:VF] = b[i:VF] + c[i:VF];
  178         }
  179 */
  180
  181 static bool
  182 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  183 {
  184   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  185   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  186   int nbbs = loop->num_nodes;
  187   gimple_stmt_iterator si;
  188   unsigned int vectorization_factor = 0;
  189   tree scalar_type;
  190   gimple phi;
  191   tree vectype;
  192   unsigned int nunits;
  193   stmt_vec_info stmt_info;
  194   int i;
  195   HOST_WIDE_INT dummy;
  196   gimple stmt, pattern_stmt = NULL;
  197   gimple_seq pattern_def_seq = NULL;
  198   gimple_stmt_iterator pattern_def_si = gsi_none ();
  199   bool analyze_pattern_stmt = false;
  200
  201   if (dump_enabled_p ())
  202     dump_printf_loc (MSG_NOTE, vect_location,
  203                      "=== vect_determine_vectorization_factor ===\n");
  204
  205   for (i = 0; i < nbbs; i++)
  206     {
  207       basic_block bb = bbs[i];
  208
            /* Examine the phis of BB.  A relevant phi must not have a vectype
               yet; it gets one derived from the type of its result, and the
               number of elements of that vectype is a candidate for VF.  */
  209       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
  210         {
  211           phi = gsi_stmt (si);
  212           stmt_info = vinfo_for_stmt (phi);
  213           if (dump_enabled_p ())
  214             {
  215               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
  216               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
  217               dump_printf (MSG_NOTE, "\n");
  218             }
  219
  220           gcc_assert (stmt_info);
  221
  222           if (STMT_VINFO_RELEVANT_P (stmt_info))
  223             {
  224               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  225               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  226
  227               if (dump_enabled_p ())
  228                 {
  229                   dump_printf_loc (MSG_NOTE, vect_location,
  230                                    "get vectype for scalar type:  ");
  231                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  232                   dump_printf (MSG_NOTE, "\n");
  233                 }
  234
  235               vectype = get_vectype_for_scalar_type (scalar_type);
  236               if (!vectype)
  237                 {
  238                   if (dump_enabled_p ())
  239                     {
  240                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  241                                        "not vectorized: unsupported "
  242                                        "data-type ");
  243                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  244                                          scalar_type);
  245                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  246                     }
  247                   return false;
  248                 }
  249               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  250
  251               if (dump_enabled_p ())
  252                 {
  253                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  254                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  255                   dump_printf (MSG_NOTE, "\n");
  256                 }
  257
  258               nunits = TYPE_VECTOR_SUBPARTS (vectype);
                    /* NOTE(review): NUNITS is unsigned int but is printed with
                       %d (here, for the stmts below, and for the final VF);
                       %u would match the type.  */
  259               if (dump_enabled_p ())
  260                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
  261                                  nunits);
  262
  263               if (!vectorization_factor
  264                   || (nunits > vectorization_factor))
  265                 vectorization_factor = nunits;
  266             }
  267         }
  268
            /* Examine the stmts of BB.  The loop header has no increment: SI is
               advanced only when a stmt is skipped, or once no pattern stmt and
               no pattern def stmt remains to be examined for the current
               position, so the body may run several times for a single SI.  */
  269       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
  270         {
  271           tree vf_vectype;
  272
  273           if (analyze_pattern_stmt)
  274             stmt = pattern_stmt;
  275           else
  276             stmt = gsi_stmt (si);
  277
  278           stmt_info = vinfo_for_stmt (stmt);
  279
  280           if (dump_enabled_p ())
  281             {
  282               dump_printf_loc (MSG_NOTE, vect_location,
  283                                "==> examining statement: ");
  284               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  285               dump_printf (MSG_NOTE, "\n");
  286             }
  287
  288           gcc_assert (stmt_info);
  289
  290           /* Skip stmts which do not need to be vectorized.  */
  291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
  292                && !STMT_VINFO_LIVE_P (stmt_info))
  293               || gimple_clobber_p (stmt))
  294             {
  295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  299                 {
                        /* The stmt itself is skipped, but its relevant or live
                           pattern stmt is examined in its place.  */
  300                   stmt = pattern_stmt;
  301                   stmt_info = vinfo_for_stmt (pattern_stmt);
  302                   if (dump_enabled_p ())
  303                     {
  304                       dump_printf_loc (MSG_NOTE, vect_location,
  305                                        "==> examining pattern statement: ");
  306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  307                       dump_printf (MSG_NOTE, "\n");
  308                     }
  309                 }
  310               else
  311                 {
  312                   if (dump_enabled_p ())
  313                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
  314                   gsi_next (&si);
  315                   continue;
  316                 }
  317             }
  318           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  319                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  320                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  321                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                  /* STMT itself is examined in this iteration; the flag makes
                     the next iteration examine PATTERN_STMT instead of
                     advancing SI.  */
  322             analyze_pattern_stmt = true;
  323
  324           /* If a pattern statement has def stmts, analyze them too.  */
  325           if (is_pattern_stmt_p (stmt_info))
  326             {
  327               if (pattern_def_seq == NULL)
  328                 {
  329                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  330                   pattern_def_si = gsi_start (pattern_def_seq);
  331                 }
  332               else if (!gsi_end_p (pattern_def_si))
  333                 gsi_next (&pattern_def_si);
  334               if (pattern_def_seq != NULL)
  335                 {
  336                   gimple pattern_def_stmt = NULL;
  337                   stmt_vec_info pattern_def_stmt_info = NULL;
  338
                        /* Find the next def stmt in the sequence that is
                           relevant or live.  */
  339                   while (!gsi_end_p (pattern_def_si))
  340                     {
  341                       pattern_def_stmt = gsi_stmt (pattern_def_si);
  342                       pattern_def_stmt_info
  343                         = vinfo_for_stmt (pattern_def_stmt);
  344                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
  345                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
  346                         break;
  347                       gsi_next (&pattern_def_si);
  348                     }
  349
  350                   if (!gsi_end_p (pattern_def_si))
  351                     {
  352                       if (dump_enabled_p ())
  353                         {
  354                           dump_printf_loc (MSG_NOTE, vect_location,
  355                                            "==> examining pattern def stmt: ");
  356                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
  357                                             pattern_def_stmt, 0);
  358                           dump_printf (MSG_NOTE, "\n");
  359                         }
  360
  361                       stmt = pattern_def_stmt;
  362                       stmt_info = pattern_def_stmt_info;
  363                     }
  364                   else
  365                     {
                            /* The def sequence is exhausted: the pattern stmt
                               itself is examined in this iteration.  */
  366                       pattern_def_si = gsi_none ();
  367                       analyze_pattern_stmt = false;
  368                     }
  369                 }
  370               else
  371                 analyze_pattern_stmt = false;
  372             }
  373
                /* Give up on a stmt that has no lhs.  */
  374           if (gimple_get_lhs (stmt) == NULL_TREE)
  375             {
  376               if (dump_enabled_p ())
  377                 {
  378                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  379                                    "not vectorized: irregular stmt.");
  380                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  381                                     0);
  382                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  383                 }
  384               return false;
  385             }
  386
                /* Give up on a stmt whose expression type has a vector mode.  */
  387           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
  388             {
  389               if (dump_enabled_p ())
  390                 {
  391                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  392                                    "not vectorized: vector stmt in loop:");
  393                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
  394                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  395                 }
  396               return false;
  397             }
  398
  399           if (STMT_VINFO_VECTYPE (stmt_info))
  400             {
  401               /* The only case when a vectype had been already set is for stmts
  402                  that contain a dataref, or for "pattern-stmts" (stmts
  403                  generated by the vectorizer to represent/replace a certain
  404                  idiom).  */
  405               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
  406                           || is_pattern_stmt_p (stmt_info)
  407                           || !gsi_end_p (pattern_def_si));
  408               vectype = STMT_VINFO_VECTYPE (stmt_info);
  409             }
  410           else
  411             {
                    /* No vectype yet: derive it from the type of the lhs and
                       record it in STMT_INFO.  */
  412               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
  413               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
  414               if (dump_enabled_p ())
  415                 {
  416                   dump_printf_loc (MSG_NOTE, vect_location,
  417                                    "get vectype for scalar type:  ");
  418                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  419                   dump_printf (MSG_NOTE, "\n");
  420                 }
  421               vectype = get_vectype_for_scalar_type (scalar_type);
  422               if (!vectype)
  423                 {
  424                   if (dump_enabled_p ())
  425                     {
  426                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  427                                        "not vectorized: unsupported "
  428                                        "data-type ");
  429                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  430                                          scalar_type);
  431                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  432                     }
  433                   return false;
  434                 }
  435
  436               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  437
  438               if (dump_enabled_p ())
  439                 {
  440                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  441                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  442                   dump_printf (MSG_NOTE, "\n");
  443                 }
  444             }
  445
  446           /* The vectorization factor is according to the smallest
  447              scalar type (or the largest vector size, but we only
  448              support one vector size per loop).  */
  449           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
  450                                                        &dummy);
  451           if (dump_enabled_p ())
  452             {
  453               dump_printf_loc (MSG_NOTE, vect_location,
  454                                "get vectype for scalar type:  ");
  455               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  456               dump_printf (MSG_NOTE, "\n");
  457             }
  458           vf_vectype = get_vectype_for_scalar_type (scalar_type);
  459           if (!vf_vectype)
  460             {
  461               if (dump_enabled_p ())
  462                 {
  463                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  464                                    "not vectorized: unsupported data-type ");
  465                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  466                                      scalar_type);
  467                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  468                 }
  469               return false;
  470             }
  471
                /* VECTYPE (the stmt's own vectype) and VF_VECTYPE (the vectype
                   of its smallest scalar type) must have the same mode size.  */
  472           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
  473                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
  474             {
  475               if (dump_enabled_p ())
  476                 {
  477                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  478                                    "not vectorized: different sized vector "
  479                                    "types in statement, ");
  480                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  481                                      vectype);
  482                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  483                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  484                                      vf_vectype);
  485                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  486                 }
  487               return false;
  488             }
  489
  490           if (dump_enabled_p ())
  491             {
  492               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  493               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
  494               dump_printf (MSG_NOTE, "\n");
  495             }
  496
                /* VF is the maximum element count seen so far.  */
  497           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
  498           if (dump_enabled_p ())
  499             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
  500           if (!vectorization_factor
  501               || (nunits > vectorization_factor))
  502             vectorization_factor = nunits;
  503
                /* Move on to the next stmt of BB only when no pattern stmt and
                   no pattern def stmt is still pending for this one.  */
  504           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  505             {
  506               pattern_def_seq = NULL;
  507               gsi_next (&si);
  508             }
  509         }
  510     }
  511
  512   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  513   if (dump_enabled_p ())
  514     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
  515                      vectorization_factor);
        /* A factor of 0 (nothing was examined) or 1 is rejected.  */
  516   if (vectorization_factor <= 1)
  517     {
  518       if (dump_enabled_p ())
  519         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  520                          "not vectorized: unsupported data-type\n");
  521       return false;
  522     }
  523   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  524
  525   return true;
  526 }
 527
 528
 529 /* Function vect_is_simple_iv_evolution.
 530
 531    FORNOW: A simple evolution of an induction variables in the loop is
 532    considered a polynomial evolution.  */
 533
 534 static bool
 535 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 536                              tree * step)
 537 {
 538   tree init_expr;
 539   tree step_expr;
 540   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 541   basic_block bb;
 542
 543   /* When there is no evolution in this loop, the evolution function
 544      is not "simple".  */
 545   if (evolution_part == NULL_TREE)
 546     return false;
 547
 548   /* When the evolution is a polynomial of degree >= 2
 549      the evolution function is not "simple".  */
 550   if (tree_is_chrec (evolution_part))
 551     return false;
 552
 553   step_expr = evolution_part;
 554   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 555
 556   if (dump_enabled_p ())
 557     {
 558       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 559       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 560       dump_printf (MSG_NOTE, ",  init: ");
 561       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 562       dump_printf (MSG_NOTE, "\n");
 563     }
 564
 565   *init = init_expr;
 566   *step = step_expr;
 567
 568   if (TREE_CODE (step_expr) != INTEGER_CST
 569       && (TREE_CODE (step_expr) != SSA_NAME
 570           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 571               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 572           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 573               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 574                   || !flag_associative_math)))
 575       && (TREE_CODE (step_expr) != REAL_CST
 576           || !flag_associative_math))
 577     {
 578       if (dump_enabled_p ())
 579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 580                          "step unknown.\n");
 581       return false;
 582     }
 583
 584   return true;
 585 }
 586
 587 /* Function vect_analyze_scalar_cycles_1.
 588
 589    Examine the cross iteration def-use cycles of scalar variables
 590    in LOOP.  LOOP_VINFO represents the loop that is now being
 591    considered for vectorization (can be LOOP, or an outer-loop
 592    enclosing LOOP).  */
 593
 594 static void
 595 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 596 {
 597   basic_block bb = loop->header;
 598   tree init, step;
 599   stack_vec<gimple, 64> worklist;
 600   gimple_stmt_iterator gsi;
 601   bool double_reduc;
 602
 603   if (dump_enabled_p ())
 604     dump_printf_loc (MSG_NOTE, vect_location,
 605                      "=== vect_analyze_scalar_cycles ===\n");
 606
 607   /* First - identify all inductions.  Reduction detection assumes that all the
 608      inductions have been identified, therefore, this order must not be
 609      changed.  */
 610   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 611     {
 612       gimple phi = gsi_stmt (gsi);
 613       tree access_fn = NULL;
 614       tree def = PHI_RESULT (phi);
 615       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 616
 617       if (dump_enabled_p ())
 618         {
 619           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 620           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 621           dump_printf (MSG_NOTE, "\n");
 622         }
 623
 624       /* Skip virtual phi's.  The data dependences that are associated with
 625          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 626       if (virtual_operand_p (def))
 627         continue;
 628
 629       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 630
 631       /* Analyze the evolution function.  */
 632       access_fn = analyze_scalar_evolution (loop, def);
 633       if (access_fn)
 634         {
 635           STRIP_NOPS (access_fn);
 636           if (dump_enabled_p ())
 637             {
 638               dump_printf_loc (MSG_NOTE, vect_location,
 639                                "Access function of PHI: ");
 640               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 641               dump_printf (MSG_NOTE, "\n");
 642             }
 643           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 644             = evolution_part_in_loop_num (access_fn, loop->num);
 645         }
 646
 647       if (!access_fn
 648           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 649           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 650               && TREE_CODE (step) != INTEGER_CST))
 651         {
 652           worklist.safe_push (phi);
 653           continue;
 654         }
 655
 656       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 657
 658       if (dump_enabled_p ())
 659         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 660       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 661     }
 662
 663
 664   /* Second - identify all reductions and nested cycles.  */
 665   while (worklist.length () > 0)
 666     {
 667       gimple phi = worklist.pop ();
 668       tree def = PHI_RESULT (phi);
 669       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 670       gimple reduc_stmt;
 671       bool nested_cycle;
 672
 673       if (dump_enabled_p ())
 674         {
 675           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 676           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 677           dump_printf (MSG_NOTE, "\n");
 678         }
 679
 680       gcc_assert (!virtual_operand_p (def)
 681                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 682
 683       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 684       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 685                                                 &double_reduc);
 686       if (reduc_stmt)
 687         {
 688           if (double_reduc)
 689             {
 690               if (dump_enabled_p ())
 691                 dump_printf_loc (MSG_NOTE, vect_location,
 692                                  "Detected double reduction.\n");
 693
 694               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 695               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 696                                                     vect_double_reduction_def;
 697             }
 698           else
 699             {
 700               if (nested_cycle)
 701                 {
 702                   if (dump_enabled_p ())
 703                     dump_printf_loc (MSG_NOTE, vect_location,
 704                                      "Detected vectorizable nested cycle.\n");
 705
 706                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 707                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 708                                                              vect_nested_cycle;
 709                 }
 710               else
 711                 {
 712                   if (dump_enabled_p ())
 713                     dump_printf_loc (MSG_NOTE, vect_location,
 714                                      "Detected reduction.\n");
 715
 716                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 717                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 718                                                            vect_reduction_def;
 719                   /* Store the reduction cycles for possible vectorization in
 720                      loop-aware SLP.  */
 721                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 722                 }
 723             }
 724         }
 725       else
 726         if (dump_enabled_p ())
 727           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 728                            "Unknown def-use cycle pattern.\n");
 729     }
 730 }
 731
 732
 733 /* Function vect_analyze_scalar_cycles.
 734
 735    Examine the cross iteration def-use cycles of scalar variables, by
 736    analyzing the loop-header PHIs of scalar variables.  Classify each
 737    cycle as one of the following: invariant, induction, reduction, unknown.
 738    We do that for the loop represented by LOOP_VINFO, and also to its
 739    inner-loop, if exists.
 740    Examples for scalar cycles:
 741
 742    Example1: reduction:
 743
 744               loop1:
 745               for (i=0; i<N; i++)
 746                  sum += a[i];
 747
 748    Example2: induction:
 749
 750               loop2:
 751               for (i=0; i<N; i++)
 752                  a[i] = i;  */
 753
 754 static void
 755 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 756 {
 757   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 758
 759   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 760
 761   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 762      Reductions in such inner-loop therefore have different properties than
 763      the reductions in the nest that gets vectorized:
 764      1. When vectorized, they are executed in the same order as in the original
 765         scalar loop, so we can't change the order of computation when
 766         vectorizing them.
 767      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 768         current checks are too strict.  */
 769
 770   if (loop->inner)
 771     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 772 }
 773
 774 /* Function vect_get_loop_niters.
 775
 776    Determine how many iterations the loop is executed.
 777    If an expression that represents the number of iterations
 778    can be constructed, place it in NUMBER_OF_ITERATIONS.
 779    Return the loop exit condition.  */
 780
 781 static gimple
 782 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 783 {
 784   tree niters;
 785
 786   if (dump_enabled_p ())
 787     dump_printf_loc (MSG_NOTE, vect_location,
 788                      "=== get_loop_niters ===\n");
 789   niters = number_of_exit_cond_executions (loop);
 790
 791   if (niters != NULL_TREE
 792       && niters != chrec_dont_know)
 793     {
 794       *number_of_iterations = niters;
 795
 796       if (dump_enabled_p ())
 797         {
 798           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
 799           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
 800           dump_printf (MSG_NOTE, "\n");
 801         }
 802     }
 803
 804   return get_loop_exit_condition (loop);
 805 }
 806
 807
 808 /* Function bb_in_loop_p
 809
 810    Used as predicate for dfs order traversal of the loop bbs.  */
 811
 812 static bool
 813 bb_in_loop_p (const_basic_block bb, const void *data)
 814 {
 815   const struct loop *const loop = (const struct loop *)data;
 816   if (flow_bb_inside_loop_p (loop, bb))
 817     return true;
 818   return false;
 819 }
 820
 821
 822 /* Function new_loop_vec_info.
 823
 824    Create and initialize a new loop_vec_info struct for LOOP, as well as
 825    stmt_vec_info structs for all the stmts in LOOP.  */
 826
 827 static loop_vec_info
 828 new_loop_vec_info (struct loop *loop)
 829 {
 830   loop_vec_info res;
 831   basic_block *bbs;
 832   gimple_stmt_iterator si;
 833   unsigned int i, nbbs;
 834
 835   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 836   LOOP_VINFO_LOOP (res) = loop;
 837
 838   bbs = get_loop_body (loop);
 839
 840   /* Create/Update stmt_info for all stmts in the loop.  */
 841   for (i = 0; i < loop->num_nodes; i++)
 842     {
 843       basic_block bb = bbs[i];
 844
 845       /* BBs in a nested inner-loop will have been already processed (because
 846          we will have called vect_analyze_loop_form for any nested inner-loop).
 847          Therefore, for stmts in an inner-loop we just want to update the
 848          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 849          loop_info of the outer-loop we are currently considering to vectorize
 850          (instead of the loop_info of the inner-loop).
 851          For stmts in other BBs we need to create a stmt_info from scratch.  */
 852       if (bb->loop_father != loop)
 853         {
 854           /* Inner-loop bb.  */
 855           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 856           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 857             {
 858               gimple phi = gsi_stmt (si);
 859               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 860               loop_vec_info inner_loop_vinfo =
 861                 STMT_VINFO_LOOP_VINFO (stmt_info);
 862               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 863               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 864             }
 865           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 866            {
 867               gimple stmt = gsi_stmt (si);
 868               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 869               loop_vec_info inner_loop_vinfo =
 870                  STMT_VINFO_LOOP_VINFO (stmt_info);
 871               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 872               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 873            }
 874         }
 875       else
 876         {
 877           /* bb in current nest.  */
 878           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 879             {
 880               gimple phi = gsi_stmt (si);
 881               gimple_set_uid (phi, 0);
 882               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 883             }
 884
 885           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 886             {
 887               gimple stmt = gsi_stmt (si);
 888               gimple_set_uid (stmt, 0);
 889               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 890             }
 891         }
 892     }
 893
 894   /* CHECKME: We want to visit all BBs before their successors (except for
 895      latch blocks, for which this assertion wouldn't hold).  In the simple
 896      case of the loop forms we allow, a dfs order of the BBs would the same
 897      as reversed postorder traversal, so we are safe.  */
 898
 899    free (bbs);
 900    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 901    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 902                               bbs, loop->num_nodes, loop);
 903    gcc_assert (nbbs == loop->num_nodes);
 904
 905   LOOP_VINFO_BBS (res) = bbs;
 906   LOOP_VINFO_NITERS (res) = NULL;
 907   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 908   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 909   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 910   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 911   LOOP_VINFO_VECT_FACTOR (res) = 0;
 912   LOOP_VINFO_LOOP_NEST (res).create (3);
 913   LOOP_VINFO_DATAREFS (res).create (10);
 914   LOOP_VINFO_DDRS (res).create (10 * 10);
 915   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 916   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 917              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 918   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 919              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 920   LOOP_VINFO_GROUPED_STORES (res).create (10);
 921   LOOP_VINFO_REDUCTIONS (res).create (10);
 922   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 923   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 924   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 925   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 926   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 927   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 928
 929   return res;
 930 }
 931
 932
 933 /* Function destroy_loop_vec_info.
 934
 935    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 936    stmts in the loop.  */
 937
 938 void
 939 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 940 {
 941   struct loop *loop;
 942   basic_block *bbs;
 943   int nbbs;
 944   gimple_stmt_iterator si;
 945   int j;
 946   vec<slp_instance> slp_instances;
 947   slp_instance instance;
 948   bool swapped;
 949
 950   if (!loop_vinfo)
 951     return;
 952
 953   loop = LOOP_VINFO_LOOP (loop_vinfo);
 954
 955   bbs = LOOP_VINFO_BBS (loop_vinfo);
 956   nbbs = clean_stmts ? loop->num_nodes : 0;
 957   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 958
 959   for (j = 0; j < nbbs; j++)
 960     {
 961       basic_block bb = bbs[j];
 962       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 963         free_stmt_vec_info (gsi_stmt (si));
 964
 965       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 966         {
 967           gimple stmt = gsi_stmt (si);
 968
 969           /* We may have broken canonical form by moving a constant
 970              into RHS1 of a commutative op.  Fix such occurrences.  */
 971           if (swapped && is_gimple_assign (stmt))
 972             {
 973               enum tree_code code = gimple_assign_rhs_code (stmt);
 974
 975               if ((code == PLUS_EXPR
 976                    || code == POINTER_PLUS_EXPR
 977                    || code == MULT_EXPR)
 978                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 979                 swap_ssa_operands (stmt,
 980                                    gimple_assign_rhs1_ptr (stmt),
 981                                    gimple_assign_rhs2_ptr (stmt));
 982             }
 983
 984           /* Free stmt_vec_info.  */
 985           free_stmt_vec_info (stmt);
 986           gsi_next (&si);
 987         }
 988     }
 989
 990   free (LOOP_VINFO_BBS (loop_vinfo));
 991   vect_destroy_datarefs (loop_vinfo, NULL);
 992   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 993   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 994   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 995   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 996   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 997   FOR_EACH_VEC_ELT (slp_instances, j, instance)
 998     vect_free_slp_instance (instance);
 999
1000   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1001   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1002   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1003   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1004
1005   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1006     LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
1007
1008   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1009
1010   free (loop_vinfo);
1011   loop->aux = NULL;
1012 }
1013
1014
1015 /* Function vect_analyze_loop_1.
1016
1017    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1018    for it. The different analyses will record information in the
1019    loop_vec_info struct.  This is a subset of the analyses applied in
1020    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1021    that is now considered for (outer-loop) vectorization.  */
1022
1023 static loop_vec_info
1024 vect_analyze_loop_1 (struct loop *loop)
1025 {
1026   loop_vec_info loop_vinfo;
1027
1028   if (dump_enabled_p ())
1029     dump_printf_loc (MSG_NOTE, vect_location,
1030                      "===== analyze_loop_nest_1 =====\n");
1031
1032   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1033
1034   loop_vinfo = vect_analyze_loop_form (loop);
1035   if (!loop_vinfo)
1036     {
1037       if (dump_enabled_p ())
1038         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039                          "bad inner-loop form.\n");
1040       return NULL;
1041     }
1042
1043   return loop_vinfo;
1044 }
1045
1046
1047 /* Function vect_analyze_loop_form.
1048
1049    Verify that certain CFG restrictions hold, including:
1050    - the loop has a pre-header
1051    - the loop has a single entry and exit
1052    - the loop exit condition is simple enough, and the number of iterations
1053      can be analyzed (a countable loop).  */
1054
1055 loop_vec_info
1056 vect_analyze_loop_form (struct loop *loop)
1057 {
1058   loop_vec_info loop_vinfo;
1059   gimple loop_cond;
1060   tree number_of_iterations = NULL;
1061   loop_vec_info inner_loop_vinfo = NULL;
1062
1063   if (dump_enabled_p ())
1064     dump_printf_loc (MSG_NOTE, vect_location,
1065                      "=== vect_analyze_loop_form ===\n");
1066
1067   /* Different restrictions apply when we are considering an inner-most loop,
1068      vs. an outer (nested) loop.
1069      (FORNOW. May want to relax some of these restrictions in the future).  */
1070
1071   if (!loop->inner)
1072     {
1073       /* Inner-most loop.  We currently require that the number of BBs is
1074          exactly 2 (the header and latch).  Vectorizable inner-most loops
1075          look like this:
1076
1077                         (pre-header)
1078                            |
1079                           header <--------+
1080                            | |            |
1081                            | +--> latch --+
1082                            |
1083                         (exit-bb)  */
1084
1085       if (loop->num_nodes != 2)
1086         {
1087           if (dump_enabled_p ())
1088             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089                              "not vectorized: control flow in loop.\n");
1090           return NULL;
1091         }
1092
1093       if (empty_block_p (loop->header))
1094     {
1095           if (dump_enabled_p ())
1096             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1097                              "not vectorized: empty loop.\n");
1098       return NULL;
1099     }
1100     }
1101   else
1102     {
1103       struct loop *innerloop = loop->inner;
1104       edge entryedge;
1105
1106       /* Nested loop. We currently require that the loop is doubly-nested,
1107          contains a single inner loop, and the number of BBs is exactly 5.
1108          Vectorizable outer-loops look like this:
1109
1110                         (pre-header)
1111                            |
1112                           header <---+
1113                            |         |
1114                           inner-loop |
1115                            |         |
1116                           tail ------+
1117                            |
1118                         (exit-bb)
1119
1120          The inner-loop has the properties expected of inner-most loops
1121          as described above.  */
1122
1123       if ((loop->inner)->inner || (loop->inner)->next)
1124         {
1125           if (dump_enabled_p ())
1126             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127                              "not vectorized: multiple nested loops.\n");
1128           return NULL;
1129         }
1130
1131       /* Analyze the inner-loop.  */
1132       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1133       if (!inner_loop_vinfo)
1134         {
1135           if (dump_enabled_p ())
1136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137                              "not vectorized: Bad inner loop.\n");
1138           return NULL;
1139         }
1140
1141       if (!expr_invariant_in_loop_p (loop,
1142                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1143         {
1144           if (dump_enabled_p ())
1145             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146                              "not vectorized: inner-loop count not"
1147                              " invariant.\n");
1148           destroy_loop_vec_info (inner_loop_vinfo, true);
1149           return NULL;
1150         }
1151
1152       if (loop->num_nodes != 5)
1153         {
1154           if (dump_enabled_p ())
1155             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1156                              "not vectorized: control flow in loop.\n");
1157           destroy_loop_vec_info (inner_loop_vinfo, true);
1158           return NULL;
1159         }
1160
1161       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1162       entryedge = EDGE_PRED (innerloop->header, 0);
1163       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1164         entryedge = EDGE_PRED (innerloop->header, 1);
1165
1166       if (entryedge->src != loop->header
1167           || !single_exit (innerloop)
1168           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1169         {
1170           if (dump_enabled_p ())
1171             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1172                              "not vectorized: unsupported outerloop form.\n");
1173           destroy_loop_vec_info (inner_loop_vinfo, true);
1174           return NULL;
1175         }
1176
1177       if (dump_enabled_p ())
1178         dump_printf_loc (MSG_NOTE, vect_location,
1179                          "Considering outer-loop vectorization.\n");
1180     }
1181
1182   if (!single_exit (loop)
1183       || EDGE_COUNT (loop->header->preds) != 2)
1184     {
1185       if (dump_enabled_p ())
1186         {
1187           if (!single_exit (loop))
1188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189                              "not vectorized: multiple exits.\n");
1190           else if (EDGE_COUNT (loop->header->preds) != 2)
1191             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192                              "not vectorized: too many incoming edges.\n");
1193         }
1194       if (inner_loop_vinfo)
1195         destroy_loop_vec_info (inner_loop_vinfo, true);
1196       return NULL;
1197     }
1198
1199   /* We assume that the loop exit condition is at the end of the loop. i.e,
1200      that the loop is represented as a do-while (with a proper if-guard
1201      before the loop if needed), where the loop header contains all the
1202      executable statements, and the latch is empty.  */
1203   if (!empty_block_p (loop->latch)
1204       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1205     {
1206       if (dump_enabled_p ())
1207         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208                          "not vectorized: latch block not empty.\n");
1209       if (inner_loop_vinfo)
1210         destroy_loop_vec_info (inner_loop_vinfo, true);
1211       return NULL;
1212     }
1213
1214   /* Make sure there exists a single-predecessor exit bb:  */
1215   if (!single_pred_p (single_exit (loop)->dest))
1216     {
1217       edge e = single_exit (loop);
1218       if (!(e->flags & EDGE_ABNORMAL))
1219         {
1220           split_loop_exit_edge (e);
1221           if (dump_enabled_p ())
1222             dump_printf (MSG_NOTE, "split exit edge.\n");
1223         }
1224       else
1225         {
1226           if (dump_enabled_p ())
1227             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228                              "not vectorized: abnormal loop exit edge.\n");
1229           if (inner_loop_vinfo)
1230             destroy_loop_vec_info (inner_loop_vinfo, true);
1231           return NULL;
1232         }
1233     }
1234
1235   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1236   if (!loop_cond)
1237     {
1238       if (dump_enabled_p ())
1239         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1240                          "not vectorized: complicated exit condition.\n");
1241       if (inner_loop_vinfo)
1242         destroy_loop_vec_info (inner_loop_vinfo, true);
1243       return NULL;
1244     }
1245
1246   if (!number_of_iterations)
1247     {
1248       if (dump_enabled_p ())
1249         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250                          "not vectorized: number of iterations cannot be "
1251                          "computed.\n");
1252       if (inner_loop_vinfo)
1253         destroy_loop_vec_info (inner_loop_vinfo, true);
1254       return NULL;
1255     }
1256
1257   if (chrec_contains_undetermined (number_of_iterations))
1258     {
1259       if (dump_enabled_p ())
1260             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261                              "Infinite number of iterations.\n");
1262       if (inner_loop_vinfo)
1263         destroy_loop_vec_info (inner_loop_vinfo, true);
1264       return NULL;
1265     }
1266
1267   if (!NITERS_KNOWN_P (number_of_iterations))
1268     {
1269       if (dump_enabled_p ())
1270         {
1271           dump_printf_loc (MSG_NOTE, vect_location,
1272                            "Symbolic number of iterations is ");
1273           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1274           dump_printf (MSG_NOTE, "\n");
1275         }
1276     }
1277   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1278     {
1279       if (dump_enabled_p ())
1280         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1281                          "not vectorized: number of iterations = 0.\n");
1282       if (inner_loop_vinfo)
1283         destroy_loop_vec_info (inner_loop_vinfo, true);
1284       return NULL;
1285     }
1286
1287   loop_vinfo = new_loop_vec_info (loop);
1288   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1289   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1290
1291   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1292
1293   /* CHECKME: May want to keep it around it in the future.  */
1294   if (inner_loop_vinfo)
1295     destroy_loop_vec_info (inner_loop_vinfo, false);
1296
1297   gcc_assert (!loop->aux);
1298   loop->aux = loop_vinfo;
1299   return loop_vinfo;
1300 }
1301
1302
1303 /* Function vect_analyze_loop_operations.
1304
1305    Scan the loop stmts and make sure they are all vectorizable.  */
1306
1307 static bool
1308 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1309 {
1310   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1311   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1312   int nbbs = loop->num_nodes;
1313   gimple_stmt_iterator si;
1314   unsigned int vectorization_factor = 0;
1315   int i;
1316   gimple phi;
1317   stmt_vec_info stmt_info;
1318   bool need_to_vectorize = false;
1319   int min_profitable_iters;
1320   int min_scalar_loop_bound;
1321   unsigned int th;
1322   bool only_slp_in_loop = true, ok;
1323   HOST_WIDE_INT max_niter;
1324   HOST_WIDE_INT estimated_niter;
1325   int min_profitable_estimate;
1326
1327   if (dump_enabled_p ())
1328     dump_printf_loc (MSG_NOTE, vect_location,
1329                      "=== vect_analyze_loop_operations ===\n");
1330
1331   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1332   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1333   if (slp)
1334     {
1335       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1336          vectorization factor of the loop is the unrolling factor required by
1337          the SLP instances.  If that unrolling factor is 1, we say, that we
1338          perform pure SLP on loop - cross iteration parallelism is not
1339          exploited.  */
1340       for (i = 0; i < nbbs; i++)
1341         {
1342           basic_block bb = bbs[i];
1343           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1344             {
1345               gimple stmt = gsi_stmt (si);
1346               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1347               gcc_assert (stmt_info);
1348               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1349                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1350                   && !PURE_SLP_STMT (stmt_info))
1351                 /* STMT needs both SLP and loop-based vectorization.  */
1352                 only_slp_in_loop = false;
1353             }
1354         }
1355
1356       if (only_slp_in_loop)
1357         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1358       else
1359         vectorization_factor = least_common_multiple (vectorization_factor,
1360                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1361
1362       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1363       if (dump_enabled_p ())
1364         dump_printf_loc (MSG_NOTE, vect_location,
1365                          "Updating vectorization factor to %d\n",
1366                          vectorization_factor);
1367     }
1368
1369   for (i = 0; i < nbbs; i++)
1370     {
1371       basic_block bb = bbs[i];
1372
1373       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1374         {
1375           phi = gsi_stmt (si);
1376           ok = true;
1377
1378           stmt_info = vinfo_for_stmt (phi);
1379           if (dump_enabled_p ())
1380             {
1381               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1382               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1383               dump_printf (MSG_NOTE, "\n");
1384             }
1385
1386           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1387              (i.e., a phi in the tail of the outer-loop).  */
1388           if (! is_loop_header_bb_p (bb))
1389             {
1390               /* FORNOW: we currently don't support the case that these phis
1391                  are not used in the outerloop (unless it is double reduction,
1392                  i.e., this phi is vect_reduction_def), cause this case
1393                  requires to actually do something here.  */
1394               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1395                    || STMT_VINFO_LIVE_P (stmt_info))
1396                   && STMT_VINFO_DEF_TYPE (stmt_info)
1397                      != vect_double_reduction_def)
1398                 {
1399                   if (dump_enabled_p ())
1400                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401                                      "Unsupported loop-closed phi in "
1402                                      "outer-loop.\n");
1403                   return false;
1404                 }
1405
1406               /* If PHI is used in the outer loop, we check that its operand
1407                  is defined in the inner loop.  */
1408               if (STMT_VINFO_RELEVANT_P (stmt_info))
1409                 {
1410                   tree phi_op;
1411                   gimple op_def_stmt;
1412
1413                   if (gimple_phi_num_args (phi) != 1)
1414                     return false;
1415
1416                   phi_op = PHI_ARG_DEF (phi, 0);
1417                   if (TREE_CODE (phi_op) != SSA_NAME)
1418                     return false;
1419
1420                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1421                   if (gimple_nop_p (op_def_stmt)
1422                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1423                       || !vinfo_for_stmt (op_def_stmt))
1424                     return false;
1425
1426                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1427                         != vect_used_in_outer
1428                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1429                            != vect_used_in_outer_by_reduction)
1430                     return false;
1431                 }
1432
1433               continue;
1434             }
1435
1436           gcc_assert (stmt_info);
1437
1438           if (STMT_VINFO_LIVE_P (stmt_info))
1439             {
1440               /* FORNOW: not yet supported.  */
1441               if (dump_enabled_p ())
1442                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443                                  "not vectorized: value used after loop.\n");
1444               return false;
1445             }
1446
1447           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1448               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1449             {
1450               /* A scalar-dependence cycle that we don't support.  */
1451               if (dump_enabled_p ())
1452                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453                                  "not vectorized: scalar dependence cycle.\n");
1454               return false;
1455             }
1456
1457           if (STMT_VINFO_RELEVANT_P (stmt_info))
1458             {
1459               need_to_vectorize = true;
1460               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1461                 ok = vectorizable_induction (phi, NULL, NULL);
1462             }
1463
1464           if (!ok)
1465             {
1466               if (dump_enabled_p ())
1467                 {
1468                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469                                    "not vectorized: relevant phi not "
1470                                    "supported: ");
1471                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1472                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1473                 }
1474               return false;
1475             }
1476         }
1477
1478       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1479         {
1480           gimple stmt = gsi_stmt (si);
1481           if (!gimple_clobber_p (stmt)
1482               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1483             return false;
1484         }
1485     } /* bbs */
1486
1487   /* All operations in the loop are either irrelevant (deal with loop
1488      control, or dead), or only used outside the loop and can be moved
1489      out of the loop (e.g. invariants, inductions).  The loop can be
1490      optimized away by scalar optimizations.  We're better off not
1491      touching this loop.  */
1492   if (!need_to_vectorize)
1493     {
1494       if (dump_enabled_p ())
1495         dump_printf_loc (MSG_NOTE, vect_location,
1496                          "All the computation can be taken out of the loop.\n");
1497       if (dump_enabled_p ())
1498         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499                          "not vectorized: redundant loop. no profit to "
1500                          "vectorize.\n");
1501       return false;
1502     }
1503
1504   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1505     dump_printf_loc (MSG_NOTE, vect_location,
1506                      "vectorization_factor = %d, niters = "
1507                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1508                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1509
1510   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1511        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1512       || ((max_niter = max_stmt_executions_int (loop)) != -1
1513           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1514     {
1515       if (dump_enabled_p ())
1516         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1517                          "not vectorized: iteration count too small.\n");
1518       if (dump_enabled_p ())
1519         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1520                          "not vectorized: iteration count smaller than "
1521                          "vectorization factor.\n");
1522       return false;
1523     }
1524
1525   /* Analyze cost.  Decide if worth while to vectorize.  */
1526
1527   /* Once VF is set, SLP costs should be updated since the number of created
1528      vector stmts depends on VF.  */
1529   vect_update_slp_costs_according_to_vf (loop_vinfo);
1530
1531   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1532                                       &min_profitable_estimate);
1533   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1534
1535   if (min_profitable_iters < 0)
1536     {
1537       if (dump_enabled_p ())
1538         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1539                          "not vectorized: vectorization not profitable.\n");
1540       if (dump_enabled_p ())
1541         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1542                          "not vectorized: vector version will never be "
1543                          "profitable.\n");
1544       return false;
1545     }
1546
1547   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1548                             * vectorization_factor) - 1);
1549
1550
1551   /* Use the cost model only if it is more conservative than user specified
1552      threshold.  */
1553
1554   th = (unsigned) min_scalar_loop_bound;
1555   if (min_profitable_iters
1556       && (!min_scalar_loop_bound
1557           || min_profitable_iters > min_scalar_loop_bound))
1558     th = (unsigned) min_profitable_iters;
1559
1560   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1561       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1562     {
1563       if (dump_enabled_p ())
1564         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1565                          "not vectorized: vectorization not profitable.\n");
1566       if (dump_enabled_p ())
1567         dump_printf_loc (MSG_NOTE, vect_location,
1568                          "not vectorized: iteration count smaller than user "
1569                          "specified loop bound parameter or minimum profitable "
1570                          "iterations (whichever is more conservative).\n");
1571       return false;
1572     }
1573
1574   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1575       && ((unsigned HOST_WIDE_INT) estimated_niter
1576           <= MAX (th, (unsigned)min_profitable_estimate)))
1577     {
1578       if (dump_enabled_p ())
1579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1580                          "not vectorized: estimated iteration count too "
1581                          "small.\n");
1582       if (dump_enabled_p ())
1583         dump_printf_loc (MSG_NOTE, vect_location,
1584                          "not vectorized: estimated iteration count smaller "
1585                          "than specified loop bound parameter or minimum "
1586                          "profitable iterations (whichever is more "
1587                          "conservative).\n");
1588       return false;
1589     }
1590
1591   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1592       || ((int) tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1593           < exact_log2 (vectorization_factor)))
1594     {
1595       if (dump_enabled_p ())
1596         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1597       if (!vect_can_advance_ivs_p (loop_vinfo)
1598           || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1599         {
1600           if (dump_enabled_p ())
1601             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1602                              "not vectorized: can't create required "
1603                              "epilog loop\n");
1604           return false;
1605         }
1606     }
1607
1608   return true;
1609 }
1610
1611
1612 /* Function vect_analyze_loop_2.
1613
1614    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1615    for it.  The different analyses will record information in the
1616    loop_vec_info struct.  */
1617 static bool
1618 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1619 {
1620   bool ok, slp = false;
1621   int max_vf = MAX_VECTORIZATION_FACTOR;
1622   int min_vf = 2;
1623
1624   /* Find all data references in the loop (which correspond to vdefs/vuses)
1625      and analyze their evolution in the loop.  Also adjust the minimal
1626      vectorization factor according to the loads and stores.
1627
1628      FORNOW: Handle only simple, array references, which
1629      alignment can be forced, and aligned pointer-references.  */
1630
1631   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1632   if (!ok)
1633     {
1634       if (dump_enabled_p ())
1635         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636                          "bad data references.\n");
1637       return false;
1638     }
1639
1640   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1641      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1642
1643   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1644   if (!ok)
1645     {
1646       if (dump_enabled_p ())
1647         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1648                          "bad data access.\n");
1649       return false;
1650     }
1651
1652   /* Classify all cross-iteration scalar data-flow cycles.
1653      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1654
1655   vect_analyze_scalar_cycles (loop_vinfo);
1656
1657   vect_pattern_recog (loop_vinfo, NULL);
1658
1659   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1660
1661   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1662   if (!ok)
1663     {
1664       if (dump_enabled_p ())
1665         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666                          "unexpected pattern.\n");
1667       return false;
1668     }
1669
1670   /* Analyze data dependences between the data-refs in the loop
1671      and adjust the maximum vectorization factor according to
1672      the dependences.
1673      FORNOW: fail at the first data dependence that we encounter.  */
1674
1675   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1676   if (!ok
1677       || max_vf < min_vf)
1678     {
1679       if (dump_enabled_p ())
1680             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1681                              "bad data dependence.\n");
1682       return false;
1683     }
1684
1685   ok = vect_determine_vectorization_factor (loop_vinfo);
1686   if (!ok)
1687     {
1688       if (dump_enabled_p ())
1689         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690                          "can't determine vectorization factor.\n");
1691       return false;
1692     }
1693   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1694     {
1695       if (dump_enabled_p ())
1696         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697                          "bad data dependence.\n");
1698       return false;
1699     }
1700
1701   /* Analyze the alignment of the data-refs in the loop.
1702      Fail if a data reference is found that cannot be vectorized.  */
1703
1704   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1705   if (!ok)
1706     {
1707       if (dump_enabled_p ())
1708         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1709                          "bad data alignment.\n");
1710       return false;
1711     }
1712
1713   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1714      It is important to call pruning after vect_analyze_data_ref_accesses,
1715      since we use grouping information gathered by interleaving analysis.  */
1716   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1717   if (!ok)
1718     {
1719       if (dump_enabled_p ())
1720         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721                          "too long list of versioning for alias "
1722                          "run-time tests.\n");
1723       return false;
1724     }
1725
1726   /* This pass will decide on using loop versioning and/or loop peeling in
1727      order to enhance the alignment of data references in the loop.  */
1728
1729   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1730   if (!ok)
1731     {
1732       if (dump_enabled_p ())
1733         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734                          "bad data alignment.\n");
1735       return false;
1736     }
1737
1738   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1739   ok = vect_analyze_slp (loop_vinfo, NULL);
1740   if (ok)
1741     {
1742       /* Decide which possible SLP instances to SLP.  */
1743       slp = vect_make_slp_decision (loop_vinfo);
1744
1745       /* Find stmts that need to be both vectorized and SLPed.  */
1746       vect_detect_hybrid_slp (loop_vinfo);
1747     }
1748   else
1749     return false;
1750
1751   /* Scan all the operations in the loop and make sure they are
1752      vectorizable.  */
1753
1754   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1755   if (!ok)
1756     {
1757       if (dump_enabled_p ())
1758         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1759                          "bad operation or unsupported loop bound.\n");
1760       return false;
1761     }
1762
1763   return true;
1764 }
1765
1766 /* Function vect_analyze_loop.
1767
1768    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1769    for it.  The different analyses will record information in the
1770    loop_vec_info struct.  */
1771 loop_vec_info
1772 vect_analyze_loop (struct loop *loop)
1773 {
1774   loop_vec_info loop_vinfo;
1775   unsigned int vector_sizes;
1776
1777   /* Autodetect first vector size we try.  */
1778   current_vector_size = 0;
1779   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1780
1781   if (dump_enabled_p ())
1782     dump_printf_loc (MSG_NOTE, vect_location,
1783                      "===== analyze_loop_nest =====\n");
1784
1785   if (loop_outer (loop)
1786       && loop_vec_info_for_loop (loop_outer (loop))
1787       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1788     {
1789       if (dump_enabled_p ())
1790         dump_printf_loc (MSG_NOTE, vect_location,
1791                          "outer-loop already vectorized.\n");
1792       return NULL;
1793     }
1794
1795   while (1)
1796     {
1797       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1798       loop_vinfo = vect_analyze_loop_form (loop);
1799       if (!loop_vinfo)
1800         {
1801           if (dump_enabled_p ())
1802             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1803                              "bad loop form.\n");
1804           return NULL;
1805         }
1806
1807       if (vect_analyze_loop_2 (loop_vinfo))
1808         {
1809           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1810
1811           return loop_vinfo;
1812         }
1813
1814       destroy_loop_vec_info (loop_vinfo, true);
1815
1816       vector_sizes &= ~current_vector_size;
1817       if (vector_sizes == 0
1818           || current_vector_size == 0)
1819         return NULL;
1820
1821       /* Try the next biggest vector size.  */
1822       current_vector_size = 1 << floor_log2 (vector_sizes);
1823       if (dump_enabled_p ())
1824         dump_printf_loc (MSG_NOTE, vect_location,
1825                          "***** Re-trying analysis with "
1826                          "vector size %d\n", current_vector_size);
1827     }
1828 }
1829
1830
1831 /* Function reduction_code_for_scalar_code
1832
1833    Input:
1834    CODE - tree_code of a reduction operations.
1835
1836    Output:
1837    REDUC_CODE - the corresponding tree-code to be used to reduce the
1838       vector of partial results into a single scalar result (which
1839       will also reside in a vector) or ERROR_MARK if the operation is
1840       a supported reduction operation, but does not have such tree-code.
1841
1842    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1843
1844 static bool
1845 reduction_code_for_scalar_code (enum tree_code code,
1846                                 enum tree_code *reduc_code)
1847 {
1848   switch (code)
1849     {
1850       case MAX_EXPR:
1851         *reduc_code = REDUC_MAX_EXPR;
1852         return true;
1853
1854       case MIN_EXPR:
1855         *reduc_code = REDUC_MIN_EXPR;
1856         return true;
1857
1858       case PLUS_EXPR:
1859         *reduc_code = REDUC_PLUS_EXPR;
1860         return true;
1861
1862       case MULT_EXPR:
1863       case MINUS_EXPR:
1864       case BIT_IOR_EXPR:
1865       case BIT_XOR_EXPR:
1866       case BIT_AND_EXPR:
1867         *reduc_code = ERROR_MARK;
1868         return true;
1869
1870       default:
1871        return false;
1872     }
1873 }
1874
1875
/* Error reporting helper for vect_is_simple_reduction below.  Print the
   message MSG, prefixed with the current vectorizer location, followed by
   the GIMPLE statement STMT (in TDF_SLIM form) and a newline.  MSG_TYPE is
   passed as the dump kind to each of the three dump calls.

   The function does not itself test dump_enabled_p; the callers visible
   in this file do so before calling it.  */

static void
report_vect_op (int msg_type, gimple stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s", msg);
  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
  dump_printf (msg_type, "\n");
}
1886
1887
1888 /* Detect SLP reduction of the form:
1889
1890    #a1 = phi <a5, a0>
1891    a2 = operation (a1)
1892    a3 = operation (a2)
1893    a4 = operation (a3)
1894    a5 = operation (a4)
1895
1896    #a = phi <a5>
1897
1898    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1899    FIRST_STMT is the first reduction stmt in the chain
1900    (a2 = operation (a1)).
1901
1902    Return TRUE if a reduction chain was detected.  */
1903
1904 static bool
1905 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1906 {
1907   struct loop *loop = (gimple_bb (phi))->loop_father;
1908   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1909   enum tree_code code;
1910   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1911   stmt_vec_info use_stmt_info, current_stmt_info;
1912   tree lhs;
1913   imm_use_iterator imm_iter;
1914   use_operand_p use_p;
1915   int nloop_uses, size = 0, n_out_of_loop_uses;
1916   bool found = false;
1917
1918   if (loop != vect_loop)
1919     return false;
1920
1921   lhs = PHI_RESULT (phi);
1922   code = gimple_assign_rhs_code (first_stmt);
1923   while (1)
1924     {
1925       nloop_uses = 0;
1926       n_out_of_loop_uses = 0;
1927       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1928         {
1929           gimple use_stmt = USE_STMT (use_p);
1930           if (is_gimple_debug (use_stmt))
1931             continue;
1932
1933           use_stmt = USE_STMT (use_p);
1934
1935           /* Check if we got back to the reduction phi.  */
1936           if (use_stmt == phi)
1937             {
1938               loop_use_stmt = use_stmt;
1939               found = true;
1940               break;
1941             }
1942
1943           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1944             {
1945               if (vinfo_for_stmt (use_stmt)
1946                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1947                 {
1948                   loop_use_stmt = use_stmt;
1949                   nloop_uses++;
1950                 }
1951             }
1952            else
1953              n_out_of_loop_uses++;
1954
1955            /* There are can be either a single use in the loop or two uses in
1956               phi nodes.  */
1957            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1958              return false;
1959         }
1960
1961       if (found)
1962         break;
1963
1964       /* We reached a statement with no loop uses.  */
1965       if (nloop_uses == 0)
1966         return false;
1967
1968       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1969       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1970         return false;
1971
1972       if (!is_gimple_assign (loop_use_stmt)
1973           || code != gimple_assign_rhs_code (loop_use_stmt)
1974           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1975         return false;
1976
1977       /* Insert USE_STMT into reduction chain.  */
1978       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1979       if (current_stmt)
1980         {
1981           current_stmt_info = vinfo_for_stmt (current_stmt);
1982           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1983           GROUP_FIRST_ELEMENT (use_stmt_info)
1984             = GROUP_FIRST_ELEMENT (current_stmt_info);
1985         }
1986       else
1987         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1988
1989       lhs = gimple_assign_lhs (loop_use_stmt);
1990       current_stmt = loop_use_stmt;
1991       size++;
1992    }
1993
1994   if (!found || loop_use_stmt != phi || size < 2)
1995     return false;
1996
1997   /* Swap the operands, if needed, to make the reduction operand be the second
1998      operand.  */
1999   lhs = PHI_RESULT (phi);
2000   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2001   while (next_stmt)
2002     {
2003       if (gimple_assign_rhs2 (next_stmt) == lhs)
2004         {
2005           tree op = gimple_assign_rhs1 (next_stmt);
2006           gimple def_stmt = NULL;
2007
2008           if (TREE_CODE (op) == SSA_NAME)
2009             def_stmt = SSA_NAME_DEF_STMT (op);
2010
2011           /* Check that the other def is either defined in the loop
2012              ("vect_internal_def"), or it's an induction (defined by a
2013              loop-header phi-node).  */
2014           if (def_stmt
2015               && gimple_bb (def_stmt)
2016               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2017               && (is_gimple_assign (def_stmt)
2018                   || is_gimple_call (def_stmt)
2019                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2020                            == vect_induction_def
2021                   || (gimple_code (def_stmt) == GIMPLE_PHI
2022                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2023                                   == vect_internal_def
2024                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2025             {
2026               lhs = gimple_assign_lhs (next_stmt);
2027               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2028               continue;
2029             }
2030
2031           return false;
2032         }
2033       else
2034         {
2035           tree op = gimple_assign_rhs2 (next_stmt);
2036           gimple def_stmt = NULL;
2037
2038           if (TREE_CODE (op) == SSA_NAME)
2039             def_stmt = SSA_NAME_DEF_STMT (op);
2040
2041           /* Check that the other def is either defined in the loop
2042             ("vect_internal_def"), or it's an induction (defined by a
2043             loop-header phi-node).  */
2044           if (def_stmt
2045               && gimple_bb (def_stmt)
2046               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2047               && (is_gimple_assign (def_stmt)
2048                   || is_gimple_call (def_stmt)
2049                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2050                               == vect_induction_def
2051                   || (gimple_code (def_stmt) == GIMPLE_PHI
2052                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2053                                   == vect_internal_def
2054                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2055             {
2056               if (dump_enabled_p ())
2057                 {
2058                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2059                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2060                   dump_printf (MSG_NOTE, "\n");
2061                 }
2062
2063               swap_ssa_operands (next_stmt,
2064                                  gimple_assign_rhs1_ptr (next_stmt),
2065                                  gimple_assign_rhs2_ptr (next_stmt));
2066               update_stmt (next_stmt);
2067
2068               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2069                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2070             }
2071           else
2072             return false;
2073         }
2074
2075       lhs = gimple_assign_lhs (next_stmt);
2076       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2077     }
2078
2079   /* Save the chain for further analysis in SLP detection.  */
2080   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2081   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2082   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2083
2084   return true;
2085 }
2086
2087
2088 /* Function vect_is_simple_reduction_1
2089
2090    (1) Detect a cross-iteration def-use cycle that represents a simple
2091    reduction computation.  We look for the following pattern:
2092
2093    loop_header:
2094      a1 = phi < a0, a2 >
2095      a3 = ...
2096      a2 = operation (a3, a1)
2097
2098    or
2099
2100    a3 = ...
2101    loop_header:
2102      a1 = phi < a0, a2 >
2103      a2 = operation (a3, a1)
2104
2105    such that:
2106    1. operation is commutative and associative and it is safe to
2107       change the order of the computation (if CHECK_REDUCTION is true)
2108    2. no uses for a2 in the loop (a2 is used out of the loop)
2109    3. no uses of a1 in the loop besides the reduction operation
2110    4. no uses of a1 outside the loop.
2111
2112    Conditions 1,4 are tested here.
2113    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2114
2115    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2116    nested cycles, if CHECK_REDUCTION is false.
2117
2118    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2119    reductions:
2120
2121      a1 = phi < a0, a2 >
2122      inner loop (def of a3)
2123      a2 = phi < a3 >
2124
   If MODIFY is true, also try to rework the code in place to enable
   detection of more reduction patterns.  For the time being we rewrite
   "res -= RHS" into "res += -RHS" when it seems worthwhile.
2128 */
2129
2130 static gimple
2131 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2132                             bool check_reduction, bool *double_reduc,
2133                             bool modify)
2134 {
2135   struct loop *loop = (gimple_bb (phi))->loop_father;
2136   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2137   edge latch_e = loop_latch_edge (loop);
2138   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2139   gimple def_stmt, def1 = NULL, def2 = NULL;
2140   enum tree_code orig_code, code;
2141   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2142   tree type;
2143   int nloop_uses;
2144   tree name;
2145   imm_use_iterator imm_iter;
2146   use_operand_p use_p;
2147   bool phi_def;
2148
2149   *double_reduc = false;
2150
2151   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2152      otherwise, we assume outer loop vectorization.  */
2153   gcc_assert ((check_reduction && loop == vect_loop)
2154               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2155
2156   name = PHI_RESULT (phi);
2157   nloop_uses = 0;
2158   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2159     {
2160       gimple use_stmt = USE_STMT (use_p);
2161       if (is_gimple_debug (use_stmt))
2162         continue;
2163
2164       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2165         {
2166           if (dump_enabled_p ())
2167             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168                              "intermediate value used outside loop.\n");
2169
2170           return NULL;
2171         }
2172
2173       if (vinfo_for_stmt (use_stmt)
2174           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2175         nloop_uses++;
2176       if (nloop_uses > 1)
2177         {
2178           if (dump_enabled_p ())
2179             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180                              "reduction used in loop.\n");
2181           return NULL;
2182         }
2183     }
2184
2185   if (TREE_CODE (loop_arg) != SSA_NAME)
2186     {
2187       if (dump_enabled_p ())
2188         {
2189           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2190                            "reduction: not ssa_name: ");
2191           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2192           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2193         }
2194       return NULL;
2195     }
2196
2197   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2198   if (!def_stmt)
2199     {
2200       if (dump_enabled_p ())
2201         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202                          "reduction: no def_stmt.\n");
2203       return NULL;
2204     }
2205
2206   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2207     {
2208       if (dump_enabled_p ())
2209         {
2210           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2211           dump_printf (MSG_NOTE, "\n");
2212         }
2213       return NULL;
2214     }
2215
2216   if (is_gimple_assign (def_stmt))
2217     {
2218       name = gimple_assign_lhs (def_stmt);
2219       phi_def = false;
2220     }
2221   else
2222     {
2223       name = PHI_RESULT (def_stmt);
2224       phi_def = true;
2225     }
2226
2227   nloop_uses = 0;
2228   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2229     {
2230       gimple use_stmt = USE_STMT (use_p);
2231       if (is_gimple_debug (use_stmt))
2232         continue;
2233       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2234           && vinfo_for_stmt (use_stmt)
2235           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2236         nloop_uses++;
2237       if (nloop_uses > 1)
2238         {
2239           if (dump_enabled_p ())
2240             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241                              "reduction used in loop.\n");
2242           return NULL;
2243         }
2244     }
2245
2246   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2247      defined in the inner loop.  */
2248   if (phi_def)
2249     {
2250       op1 = PHI_ARG_DEF (def_stmt, 0);
2251
2252       if (gimple_phi_num_args (def_stmt) != 1
2253           || TREE_CODE (op1) != SSA_NAME)
2254         {
2255           if (dump_enabled_p ())
2256             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257                              "unsupported phi node definition.\n");
2258
2259           return NULL;
2260         }
2261
2262       def1 = SSA_NAME_DEF_STMT (op1);
2263       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2264           && loop->inner
2265           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2266           && is_gimple_assign (def1))
2267         {
2268           if (dump_enabled_p ())
2269             report_vect_op (MSG_NOTE, def_stmt,
2270                             "detected double reduction: ");
2271
2272           *double_reduc = true;
2273           return def_stmt;
2274         }
2275
2276       return NULL;
2277     }
2278
2279   code = orig_code = gimple_assign_rhs_code (def_stmt);
2280
2281   /* We can handle "res -= x[i]", which is non-associative by
2282      simply rewriting this into "res += -x[i]".  Avoid changing
2283      gimple instruction for the first simple tests and only do this
2284      if we're allowed to change code at all.  */
2285   if (code == MINUS_EXPR
2286       && modify
2287       && (op1 = gimple_assign_rhs1 (def_stmt))
2288       && TREE_CODE (op1) == SSA_NAME
2289       && SSA_NAME_DEF_STMT (op1) == phi)
2290     code = PLUS_EXPR;
2291
2292   if (check_reduction
2293       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2294     {
2295       if (dump_enabled_p ())
2296         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2297                         "reduction: not commutative/associative: ");
2298       return NULL;
2299     }
2300
2301   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2302     {
2303       if (code != COND_EXPR)
2304         {
2305           if (dump_enabled_p ())
2306             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2307                             "reduction: not binary operation: ");
2308
2309           return NULL;
2310         }
2311
2312       op3 = gimple_assign_rhs1 (def_stmt);
2313       if (COMPARISON_CLASS_P (op3))
2314         {
2315           op4 = TREE_OPERAND (op3, 1);
2316           op3 = TREE_OPERAND (op3, 0);
2317         }
2318
2319       op1 = gimple_assign_rhs2 (def_stmt);
2320       op2 = gimple_assign_rhs3 (def_stmt);
2321
2322       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2323         {
2324           if (dump_enabled_p ())
2325             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2326                             "reduction: uses not ssa_names: ");
2327
2328           return NULL;
2329         }
2330     }
2331   else
2332     {
2333       op1 = gimple_assign_rhs1 (def_stmt);
2334       op2 = gimple_assign_rhs2 (def_stmt);
2335
2336       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2337         {
2338           if (dump_enabled_p ())
2339             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2340                             "reduction: uses not ssa_names: ");
2341
2342           return NULL;
2343         }
2344    }
2345
2346   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2347   if ((TREE_CODE (op1) == SSA_NAME
2348        && !types_compatible_p (type,TREE_TYPE (op1)))
2349       || (TREE_CODE (op2) == SSA_NAME
2350           && !types_compatible_p (type, TREE_TYPE (op2)))
2351       || (op3 && TREE_CODE (op3) == SSA_NAME
2352           && !types_compatible_p (type, TREE_TYPE (op3)))
2353       || (op4 && TREE_CODE (op4) == SSA_NAME
2354           && !types_compatible_p (type, TREE_TYPE (op4))))
2355     {
2356       if (dump_enabled_p ())
2357         {
2358           dump_printf_loc (MSG_NOTE, vect_location,
2359                            "reduction: multiple types: operation type: ");
2360           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2361           dump_printf (MSG_NOTE, ", operands types: ");
2362           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2363                              TREE_TYPE (op1));
2364           dump_printf (MSG_NOTE, ",");
2365           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2366                              TREE_TYPE (op2));
2367           if (op3)
2368             {
2369               dump_printf (MSG_NOTE, ",");
2370               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2371                                  TREE_TYPE (op3));
2372             }
2373
2374           if (op4)
2375             {
2376               dump_printf (MSG_NOTE, ",");
2377               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2378                                  TREE_TYPE (op4));
2379             }
2380           dump_printf (MSG_NOTE, "\n");
2381         }
2382
2383       return NULL;
2384     }
2385
2386   /* Check that it's ok to change the order of the computation.
2387      Generally, when vectorizing a reduction we change the order of the
2388      computation.  This may change the behavior of the program in some
2389      cases, so we need to check that this is ok.  One exception is when
2390      vectorizing an outer-loop: the inner-loop is executed sequentially,
2391      and therefore vectorizing reductions in the inner-loop during
2392      outer-loop vectorization is safe.  */
2393
2394   /* CHECKME: check for !flag_finite_math_only too?  */
2395   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2396       && check_reduction)
2397     {
2398       /* Changing the order of operations changes the semantics.  */
2399       if (dump_enabled_p ())
2400         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2401                         "reduction: unsafe fp math optimization: ");
2402       return NULL;
2403     }
2404   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2405            && check_reduction)
2406     {
2407       /* Changing the order of operations changes the semantics.  */
2408       if (dump_enabled_p ())
2409         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2410                         "reduction: unsafe int math optimization: ");
2411       return NULL;
2412     }
2413   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2414     {
2415       /* Changing the order of operations changes the semantics.  */
2416       if (dump_enabled_p ())
2417         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2418                         "reduction: unsafe fixed-point math optimization: ");
2419       return NULL;
2420     }
2421
2422   /* If we detected "res -= x[i]" earlier, rewrite it into
2423      "res += -x[i]" now.  If this turns out to be useless reassoc
2424      will clean it up again.  */
2425   if (orig_code == MINUS_EXPR)
2426     {
2427       tree rhs = gimple_assign_rhs2 (def_stmt);
2428       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2429       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2430                                                          rhs, NULL);
2431       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2432       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2433                                                           loop_info, NULL));
2434       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2435       gimple_assign_set_rhs2 (def_stmt, negrhs);
2436       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2437       update_stmt (def_stmt);
2438     }
2439
2440   /* Reduction is safe. We're dealing with one of the following:
2441      1) integer arithmetic and no trapv
2442      2) floating point arithmetic, and special flags permit this optimization
2443      3) nested cycle (i.e., outer loop vectorization).  */
2444   if (TREE_CODE (op1) == SSA_NAME)
2445     def1 = SSA_NAME_DEF_STMT (op1);
2446
2447   if (TREE_CODE (op2) == SSA_NAME)
2448     def2 = SSA_NAME_DEF_STMT (op2);
2449
2450   if (code != COND_EXPR
2451       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2452     {
2453       if (dump_enabled_p ())
2454         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2455       return NULL;
2456     }
2457
2458   /* Check that one def is the reduction def, defined by PHI,
2459      the other def is either defined in the loop ("vect_internal_def"),
2460      or it's an induction (defined by a loop-header phi-node).  */
2461
2462   if (def2 && def2 == phi
2463       && (code == COND_EXPR
2464           || !def1 || gimple_nop_p (def1)
2465           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2466           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2467               && (is_gimple_assign (def1)
2468                   || is_gimple_call (def1)
2469                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2470                       == vect_induction_def
2471                   || (gimple_code (def1) == GIMPLE_PHI
2472                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2473                           == vect_internal_def
2474                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2475     {
2476       if (dump_enabled_p ())
2477         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2478       return def_stmt;
2479     }
2480
2481   if (def1 && def1 == phi
2482       && (code == COND_EXPR
2483           || !def2 || gimple_nop_p (def2)
2484           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2485           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2486               && (is_gimple_assign (def2)
2487                   || is_gimple_call (def2)
2488                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2489                       == vect_induction_def
2490                   || (gimple_code (def2) == GIMPLE_PHI
2491                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2492                           == vect_internal_def
2493                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2494     {
2495       if (check_reduction)
2496         {
2497           /* Swap operands (just for simplicity - so that the rest of the code
2498              can assume that the reduction variable is always the last (second)
2499              argument).  */
2500           if (dump_enabled_p ())
2501             report_vect_op (MSG_NOTE, def_stmt,
2502                             "detected reduction: need to swap operands: ");
2503
2504           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2505                              gimple_assign_rhs2_ptr (def_stmt));
2506
2507           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2508             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2509         }
2510       else
2511         {
2512           if (dump_enabled_p ())
2513             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2514         }
2515
2516       return def_stmt;
2517     }
2518
2519   /* Try to find SLP reduction chain.  */
2520   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2521     {
2522       if (dump_enabled_p ())
2523         report_vect_op (MSG_NOTE, def_stmt,
2524                         "reduction: detected reduction chain: ");
2525
2526       return def_stmt;
2527     }
2528
2529   if (dump_enabled_p ())
2530     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2531                     "reduction: unknown pattern: ");
2532
2533   return NULL;
2534 }
2535
2536 /* Wrapper around vect_is_simple_reduction_1 that never modifies code
2537    in-place: the final argument passed on is false.  Arguments as there.  */
2538 static gimple
2539 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2540                           bool check_reduction, bool *double_reduc)
2541 {
2542   const bool modify_in_place = false;
2543   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2544                                      double_reduc, modify_in_place);
2545 }
2546
2547 /* Wrapper around vect_is_simple_reduction_1 that passes true as the final
2548    argument, allowing code to be modified in-place if that enables
2549    detection of more reductions.  Arguments as there.  */
2550 gimple
2551 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2552                              bool check_reduction, bool *double_reduc)
2553 {
2554   const bool modify_in_place = true;
2555   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2556                                      double_reduc, modify_in_place);
2557 }
2558
2559 /* Return the cost of one scalar iteration of the loop in LOOP_VINFO.  */
2560 int
2561 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2562 {
2563   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2564   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2565   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2566   int innerloop_iters, i, stmt_cost;
2567
2568   /* Count statements in scalar loop.  Using this as scalar cost for a single
2569      iteration for now.  Only assignments and calls are counted; each one is
2570      costed as a scalar load, a scalar store or a plain scalar statement.
2571      TODO: Add outer loop support.
2572
2573      TODO: Consider assigning different costs to different scalar
2574      statements.  */
2575
2576   /* FORNOW: statements in the inner loop are weighted by a fixed factor.  */
2577   innerloop_iters = 1;
2578   if (loop->inner)
2579     innerloop_iters = 50; /* FIXME */
2580
2581   for (i = 0; i < nbbs; i++)
2582     {
2583       gimple_stmt_iterator si;
2584       basic_block bb = bbs[i];
2585
2586       if (bb->loop_father == loop->inner)
2587         factor = innerloop_iters;
2588       else
2589         factor = 1;
2590
2591       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2592         {
2593           gimple stmt = gsi_stmt (si);
2594           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2595
2596           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2597             continue;
2598
2599           /* Skip stmts that are not vectorized inside the loop.  */
2600           if (stmt_info
2601               && !STMT_VINFO_RELEVANT_P (stmt_info)
2602               && (!STMT_VINFO_LIVE_P (stmt_info)
2603                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2604               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2605             continue;
2606           /* STMT_INFO may be NULL (see above); then cost a plain stmt.  */
2607           if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
2608             {
2609               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2610                 stmt_cost = vect_get_stmt_cost (scalar_load);
2611               else
2612                 stmt_cost = vect_get_stmt_cost (scalar_store);
2613             }
2614           else
2615             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2616
2617           scalar_single_iter_cost += stmt_cost * factor;
2618         }
2619     }
2620   return scalar_single_iter_cost;
2621 }
2622
2623 /* Cost of peeling PEEL_ITERS_PROLOGUE iterations plus the implied epilogue.  */
2624 int
2625 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2626                              int *peel_iters_epilogue,
2627                              int scalar_single_iter_cost,
2628                              stmt_vector_for_cost *prologue_cost_vec,
2629                              stmt_vector_for_cost *epilogue_cost_vec)
2630 {
2631   int cost = 0;
2632   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2633
2634   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2635     {
2636       int iter_count = LOOP_VINFO_INT_NITERS (loop_vinfo);
2637       /* The prologue cannot run more iterations than the loop has.  */
2638       if (iter_count < peel_iters_prologue)
2639         peel_iters_prologue = iter_count;
2640       *peel_iters_epilogue = (iter_count - peel_iters_prologue) % vect_factor;
2641       /* Peeling for gaps with an empty epilogue: peel VF iterations.  */
2642       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && *peel_iters_epilogue == 0)
2643         *peel_iters_epilogue = vect_factor;
2644     }
2645   else
2646     {
2647       /* Unknown iteration count: assume VF/2 epilogue iterations.  */
2648       *peel_iters_epilogue = vect_factor / 2;
2649       if (dump_enabled_p ())
2650         dump_printf_loc (MSG_NOTE, vect_location,
2651                          "cost model: epilogue peel iters set to vf/2 "
2652                          "because loop iterations are unknown .\n");
2653
2654       /* Record two taken branches as prologue cost.  */
2655       cost = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2656                                NULL, 0, vect_prologue);
2657     }
2658
2659   if (peel_iters_prologue != 0)
2660     cost += record_stmt_cost (prologue_cost_vec,
2661                               peel_iters_prologue * scalar_single_iter_cost,
2662                               scalar_stmt, NULL, 0, vect_prologue);
2663   if (*peel_iters_epilogue != 0)
2664     cost += record_stmt_cost (epilogue_cost_vec,
2665                               *peel_iters_epilogue * scalar_single_iter_cost,
2666                               scalar_stmt, NULL, 0, vect_epilogue);
2667   return cost;
2668 }
2669
2670 /* Function vect_estimate_min_profitable_iters
2671
2672    Estimate the iteration counts from which the vector version of the loop
2673    is profitable: *RET_MIN_PROFITABLE_NITERS gets the run-time threshold,
2674    *RET_MIN_PROFITABLE_ESTIMATE the static one (0: no cost model, -1: never).  */
2675
2676 static void
2677 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2678                                     int *ret_min_profitable_niters,
2679                                     int *ret_min_profitable_estimate)
2680 {
2681   int min_profitable_iters;
2682   int min_profitable_estimate;
2683   int peel_iters_prologue;
2684   int peel_iters_epilogue;
2685   unsigned vec_inside_cost = 0;
2686   int vec_outside_cost = 0;
2687   unsigned vec_prologue_cost = 0;
2688   unsigned vec_epilogue_cost = 0;
2689   int scalar_single_iter_cost = 0;
2690   int scalar_outside_cost = 0;
2691   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2692   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2693   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2694
2695   /* Cost model disabled.  */
2696   if (unlimited_cost_model ())
2697     {
2698       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2699       *ret_min_profitable_niters = 0;
2700       *ret_min_profitable_estimate = 0;
2701       return;
2702     }
2703
2704   /* Requires loop versioning tests to handle misalignment.  */
2705   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2706     {
2707       /*  FIXME: Make cost depend on complexity of individual check.  */
2708       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2709       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2710                             vect_prologue);
2711       dump_printf (MSG_NOTE,
2712                    "cost model: Adding cost of checks for loop "
2713                    "versioning to treat misalignment.\n");
2714     }
2715
2716   /* Requires loop versioning with alias checks.  */
2717   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2718     {
2719       /*  FIXME: Make cost depend on complexity of individual check.  */
2720       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2721       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2722                             vect_prologue);
2723       dump_printf (MSG_NOTE,
2724                    "cost model: Adding cost of checks for loop "
2725                    "versioning aliasing.\n");
2726     }
2727
2728   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2729       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2730     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2731                           vect_prologue);
2732
2733   /* Count statements in scalar loop.  Using this as scalar cost for a single
2734      iteration for now.
2735
2736      TODO: Add outer loop support.
2737
2738      TODO: Consider assigning different costs to different scalar
2739      statements.  */
2740
2741   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2742
2743   /* Add additional cost for the peeled instructions in prologue and epilogue
2744      loop.
2745
2746      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2747      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2748
2749      TODO: Build an expression that represents peel_iters for prologue and
2750      epilogue to be used in a run-time test.  */
2751
2752   if (npeel  < 0)
2753     {
2754       peel_iters_prologue = vf/2;
2755       dump_printf (MSG_NOTE, "cost model: "
2756                    "prologue peel iters set to vf/2.\n");
2757
2758       /* If peeling for alignment is unknown, loop bound of main loop becomes
2759          unknown.  */
2760       peel_iters_epilogue = vf/2;
2761       dump_printf (MSG_NOTE, "cost model: "
2762                    "epilogue peel iters set to vf/2 because "
2763                    "peeling for alignment is unknown.\n");
2764
2765       /* If peeled iterations are unknown, count a taken branch and a not taken
2766          branch per peeled loop. Even if scalar loop iterations are known,
2767          vector iterations are not known since peeled prologue iterations are
2768          not known. Hence guards remain the same.  */
2769       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2770                             NULL, 0, vect_prologue);
2771       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2772                             NULL, 0, vect_prologue);
2773       /* FORNOW: Don't attempt to pass individual scalar instructions to
2774          the model; just assume linear cost for scalar iterations.  */
2775       (void) add_stmt_cost (target_cost_data,
2776                             peel_iters_prologue * scalar_single_iter_cost,
2777                             scalar_stmt, NULL, 0, vect_prologue);
2778       (void) add_stmt_cost (target_cost_data,
2779                             peel_iters_epilogue * scalar_single_iter_cost,
2780                             scalar_stmt, NULL, 0, vect_epilogue);
2781     }
2782   else
2783     {
2784       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2785       stmt_info_for_cost *si;
2786       int j;
2787       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2788
2789       prologue_cost_vec.create (2);
2790       epilogue_cost_vec.create (2);
2791       peel_iters_prologue = npeel;
2792
2793       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2794                                           &peel_iters_epilogue,
2795                                           scalar_single_iter_cost,
2796                                           &prologue_cost_vec,
2797                                           &epilogue_cost_vec);
2798
2799       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2800         {
2801           struct _stmt_vec_info *stmt_info
2802             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2803           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2804                                 si->misalign, vect_prologue);
2805         }
2806
2807       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2808         {
2809           struct _stmt_vec_info *stmt_info
2810             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2811           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2812                                 si->misalign, vect_epilogue);
2813         }
2814
2815       prologue_cost_vec.release ();
2816       epilogue_cost_vec.release ();
2817     }
2818
2819   /* FORNOW: The scalar outside cost is incremented in one of the
2820      following ways:
2821
2822      1. The vectorizer checks for alignment and aliasing and generates
2823      a condition that allows dynamic vectorization.  A cost model
2824      check is ANDED with the versioning condition.  Hence scalar code
2825      path now has the added cost of the versioning check.
2826
2827        if (cost > th & versioning_check)
2828          jmp to vector code
2829
2830      Hence run-time scalar is incremented by not-taken branch cost.
2831
2832      2. The vectorizer then checks if a prologue is required.  If the
2833      cost model check was not done before during versioning, it has to
2834      be done before the prologue check.
2835
2836        if (cost <= th)
2837          prologue = scalar_iters
2838        if (prologue == 0)
2839          jmp to vector code
2840        else
2841          execute prologue
2842        if (prologue == num_iters)
2843          go to exit
2844
2845      Hence the run-time scalar cost is incremented by a taken branch,
2846      plus a not-taken branch, plus a taken branch cost.
2847
2848      3. The vectorizer then checks if an epilogue is required.  If the
2849      cost model check was not done before during prologue check, it
2850      has to be done with the epilogue check.
2851
2852        if (prologue == 0)
2853          jmp to vector code
2854        else
2855          execute prologue
2856        if (prologue == num_iters)
2857          go to exit
2858        vector code:
2859          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2860            jmp to epilogue
2861
2862      Hence the run-time scalar cost should be incremented by 2 taken
2863      branches.
2864
2865      TODO: The back end may reorder the BBS's differently and reverse
2866      conditions/branch directions.  Change the estimates below to
2867      something more reasonable.  */
2868
2869   /* If the number of iterations is known and we do not do versioning, we can
2870      decide whether to vectorize at compile time.  Hence the scalar version
2871      do not carry cost model guard costs.  */
2872   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2873       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2874       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2875     {
2876       /* Cost model check occurs at versioning.  */
2877       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2878           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2879         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2880       else
2881         {
2882           /* Cost model check occurs at prologue generation.  */
2883           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2884             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2885               + vect_get_stmt_cost (cond_branch_not_taken);
2886           /* Cost model check occurs at epilogue generation.  */
2887           else
2888             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2889         }
2890     }
2891
2892   /* Complete the target-specific cost calculations.  */
2893   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2894                &vec_inside_cost, &vec_epilogue_cost);
2895
2896   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2897
2898   /* Calculate number of iterations required to make the vector version
2899      profitable, relative to the loop bodies only.  This must hold:
2900      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2901      where SIC = scalar iteration cost, VIC = vector iteration cost,
2902      VOC = vector outside cost, VF = vectorization factor,
2903      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2904      SOC = scalar outside cost for run time cost model check.
2905      VIC is unsigned, so cast it to int: the numerator below can be
2906      negative and must not wrap around before the division.  */
2907
2908   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2909     {
2910       if (vec_outside_cost <= 0)
2911         min_profitable_iters = 1;
2912       else
2913         {
2914           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2915                                   - (int) vec_inside_cost * peel_iters_prologue
2916                                   - (int) vec_inside_cost * peel_iters_epilogue)
2917                                  / ((scalar_single_iter_cost * vf)
2918                                     - (int) vec_inside_cost);
2919
2920           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2921               <= (((int) vec_inside_cost * min_profitable_iters)
2922                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2923             min_profitable_iters++;
2924         }
2925     }
2926   /* vector version will never be profitable.  */
2927   else
2928     {
2929       if (dump_enabled_p ())
2930         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2931                          "cost model: the vector iteration cost = %d "
2932                          "divided by the scalar iteration cost = %d "
2933                          "is greater or equal to the vectorization factor = %d"
2934                          ".\n",
2935                          vec_inside_cost, scalar_single_iter_cost, vf);
2936       *ret_min_profitable_niters = -1;
2937       *ret_min_profitable_estimate = -1;
2938       return;
2939     }
2940
2941   if (dump_enabled_p ())
2942     {
2943       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2944       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2945                    vec_inside_cost);
2946       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2947                    vec_prologue_cost);
2948       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2949                    vec_epilogue_cost);
2950       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2951                    scalar_single_iter_cost);
2952       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2953                    scalar_outside_cost);
2954       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2955                    vec_outside_cost);
2956       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2957                    peel_iters_prologue);
2958       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2959                    peel_iters_epilogue);
2960       dump_printf (MSG_NOTE,
2961                    "  Calculated minimum iters for profitability: %d\n",
2962                    min_profitable_iters);
2963       dump_printf (MSG_NOTE, "\n");
2964     }
2965
2966   min_profitable_iters =
2967         min_profitable_iters < vf ? vf : min_profitable_iters;
2968
2969   /* Because the condition we create is:
2970      if (niters <= min_profitable_iters)
2971        then skip the vectorized loop.  */
2972   min_profitable_iters--;
2973
2974   if (dump_enabled_p ())
2975     dump_printf_loc (MSG_NOTE, vect_location,
2976                      "  Runtime profitability threshold = %d\n",
2977                      min_profitable_iters);
2978
2979   *ret_min_profitable_niters = min_profitable_iters;
2980
2981   /* Calculate number of iterations required to make the vector version
2982      profitable, relative to the loop bodies only.
2983
2984      Non-vectorized variant is SIC * niters and it must win over vector
2985      variant on the expected loop trip count.  The following condition must hold true:
2986      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2987
2988   if (vec_outside_cost <= 0)
2989     min_profitable_estimate = 1;
2990   else
2991     {
2992       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2993                                  - (int) vec_inside_cost * peel_iters_prologue
2994                                  - (int) vec_inside_cost * peel_iters_epilogue)
2995                                  / ((scalar_single_iter_cost * vf)
2996                                    - (int) vec_inside_cost);
2997     }
2998   min_profitable_estimate --;
2999   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3000   if (dump_enabled_p ())
3001     dump_printf_loc (MSG_NOTE, vect_location,
3002                      "  Static estimate profitability threshold = %d\n",
3003                       min_profitable_estimate);
3004
3005   *ret_min_profitable_estimate = min_profitable_estimate;
3006 }
3007
3008
3009 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3010    functions. Design better to avoid maintenance issues.  */
3011
3012 /* Function vect_model_reduction_cost.
3013
3014    Record the cost of the reduction STMT_INFO: NCOPIES vector statements in
3015    the loop body, the initial definition in the prologue and, unless
3016    nested_in_vect_loop_p holds for the statement, the epilogue code selected
3017    by REDUC_CODE.  Return false if no vector type exists for the reduction
3018    operand, true otherwise.  */
3019
3020 static bool
3021 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3022                            int ncopies)
3023 {
3024   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3025   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3026   void *cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3027   int epilogue_cost = 0;
3028   tree reduction_op;
3029
3030   /* Cost of reduction op inside loop, once per copy.  */
3031   unsigned inside_cost = add_stmt_cost (cost_data, ncopies, vector_stmt,
3032                                         stmt_info, 0, vect_body);
3033   gimple stmt = STMT_VINFO_STMT (stmt_info);
3034
3035   /* The reduction operand is the last operand of the statement.  */
3036   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3037     {
3038     case GIMPLE_UNARY_RHS:
3039       reduction_op = gimple_assign_rhs1 (stmt);
3040       break;
3041     case GIMPLE_BINARY_RHS:
3042       reduction_op = gimple_assign_rhs2 (stmt);
3043       break;
3044     case GIMPLE_TERNARY_RHS:
3045       reduction_op = gimple_assign_rhs3 (stmt);
3046       break;
3047     case GIMPLE_SINGLE_RHS:
3048       {
3049         tree rhs = gimple_assign_rhs1 (stmt);
3050         gcc_assert (TREE_OPERAND_LENGTH (rhs) == ternary_op);
3051         reduction_op = TREE_OPERAND (rhs, 2);
3052         break;
3053       }
3054     default:
3055       gcc_unreachable ();
3056     }
3057
3058   tree vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3059   if (!vectype)
3060     {
3061       if (dump_enabled_p ())
3062         {
3063           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3064                            "unsupported data-type ");
3065           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3066                              TREE_TYPE (reduction_op));
3067           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3068         }
3069       return false;
3070     }
3071
3072   enum machine_mode mode = TYPE_MODE (vectype);
3073
3074   /* Use the related statement when there is one, else STMT itself.  */
3075   gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3076   if (!orig_stmt)
3077     orig_stmt = STMT_VINFO_STMT (stmt_info);
3078
3079   enum tree_code code = gimple_assign_rhs_code (orig_stmt);
3080
3081   /* Add in cost for initial definition.  */
3082   int prologue_cost = add_stmt_cost (cost_data, 1, scalar_to_vec,
3083                                      stmt_info, 0, vect_prologue);
3084
3085   /* Determine cost of epilogue code.  None is recorded when
3086      nested_in_vect_loop_p holds for ORIG_STMT.  */
3087   const bool want_epilogue = !nested_in_vect_loop_p (loop, orig_stmt);
3088
3089   if (want_epilogue && reduc_code != ERROR_MARK)
3090     {
3091       /* We have a reduction operator that will reduce the vector in one
3092          statement.  Also requires scalar extract.  */
3093       epilogue_cost += add_stmt_cost (cost_data, 1, vector_stmt,
3094                                       stmt_info, 0, vect_epilogue);
3095       epilogue_cost += add_stmt_cost (cost_data, 1, vec_to_scalar,
3096                                       stmt_info, 0, vect_epilogue);
3097     }
3098   else if (want_epilogue)
3099     {
3100       /* No single-statement reduction: count the vector elements that the
3101          epilogue has to combine.  */
3102       int vec_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3103       tree elt_size = TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3104       int elt_bits = tree_to_uhwi (elt_size);
3105       int nelts = vec_bits / elt_bits;
3106       optab reduc_optab = optab_for_tree_code (code, vectype, optab_default);
3107
3108       /* Do we have a whole vector shift available?  */
3109       bool have_vec_shift
3110         = (VECTOR_MODE_P (mode)
3111            && optab_handler (reduc_optab, mode) != CODE_FOR_nothing
3112            && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing);
3113
3114       if (!have_vec_shift)
3115         /* Use extracts and reduction op for final reduction.  For N
3116            elements, we have N extracts and N-1 reduction ops.  */
3117         epilogue_cost += add_stmt_cost (cost_data, nelts + nelts - 1,
3118                                         vector_stmt, stmt_info, 0,
3119                                         vect_epilogue);
3120       else
3121         {
3122           /* Final reduction via vector shifts and the reduction operator.
3123              Also requires scalar extract.  */
3124           epilogue_cost += add_stmt_cost (cost_data, exact_log2 (nelts) * 2,
3125                                           vector_stmt, stmt_info, 0,
3126                                           vect_epilogue);
3127           epilogue_cost += add_stmt_cost (cost_data, 1, vec_to_scalar,
3128                                           stmt_info, 0, vect_epilogue);
3129         }
3130     }
3131
3132   if (dump_enabled_p ())
3133     dump_printf (MSG_NOTE,
3134                  "vect_model_reduction_cost: inside_cost = %d, "
3135                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3136                  prologue_cost, epilogue_cost);
3137
3138   return true;
3139 }
3140
3141
3142 /* Function vect_model_induction_cost.
3143
3144    Record the costs of vectorizing induction STMT_INFO in NCOPIES copies.  */
3145
3146 static void
3147 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3148 {
3149   void *cost_data
3150     = LOOP_VINFO_TARGET_COST_DATA (STMT_VINFO_LOOP_VINFO (stmt_info));
3151
3152   /* Loop body cost: NCOPIES vector statements for vec_loop.  */
3153   unsigned body_cost = add_stmt_cost (cost_data, ncopies, vector_stmt,
3154                                       stmt_info, 0, vect_body);
3155
3156   /* Prologue cost: two scalar_to_vec operations, vec_init and vec_step.  */
3157   unsigned setup_cost = add_stmt_cost (cost_data, 2, scalar_to_vec,
3158                                        stmt_info, 0, vect_prologue);
3159
3160   if (!dump_enabled_p ())
3161     return;
3162   dump_printf_loc (MSG_NOTE, vect_location,
3163                    "vect_model_induction_cost: inside_cost = %d, "
3164                    "prologue_cost = %d .\n", body_cost, setup_cost);
3165 }
3166
3167
3168 /* Function get_initial_def_for_induction
3169
3170    Input:
3171    IV_PHI - the scalar phi node of the induction variable to vectorize.
3172      Its initial value and step are derived from its scalar evolution.
3173
3174    Output:
3175    Return the result of a new vector phi, created in the header of the
3176    loop that contains IV_PHI, which carries the vectorized induction.
3177    When that loop is the one being vectorized, for an iv with initial
3178    value X and step S and a vector of 4 units, the phi starts as
        [X, X + S, X + 2*S, X + 3*S] and advances by VF*S per iteration.
        If the vector type of the result of IV_PHI differs from the vector
        type of the initial value, a VIEW_CONVERT_EXPR of the phi result is
        returned instead.  */
3179
3180 static tree
3181 get_initial_def_for_induction (gimple iv_phi)
3182 {
3183   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3184   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3186   tree vectype;
3187   int nunits;
3188   edge pe = loop_preheader_edge (loop);
3189   struct loop *iv_loop;
3190   basic_block new_bb;
3191   tree new_vec, vec_init, vec_step, t;
3192   tree access_fn;
3193   tree new_var;
3194   tree new_name;
3195   gimple init_stmt, induction_phi, new_stmt;
3196   tree induc_def, vec_def, vec_dest;
3197   tree init_expr, step_expr;
3198   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3199   int i;
3200   bool ok;
3201   int ncopies;
3202   tree expr;
       /* Same stmt info as STMT_VINFO above; both come from IV_PHI.  */
3203   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3204   bool nested_in_vect_loop = false;
3205   gimple_seq stmts = NULL;
3206   imm_use_iterator imm_iter;
3207   use_operand_p use_p;
3208   gimple exit_phi;
3209   edge latch_e;
3210   tree loop_arg;
3211   gimple_stmt_iterator si;
3212   basic_block bb = gimple_bb (iv_phi);
3213   tree stepvectype;
3214   tree resvectype;
3215
3216   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3217   if (nested_in_vect_loop_p (loop, iv_phi))
3218     {
3219       nested_in_vect_loop = true;
3220       iv_loop = loop->inner;
3221     }
3222   else
3223     iv_loop = loop;
3224   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3225
       /* LOOP_ARG is the scalar value IV_PHI receives over the latch edge; it
          is used at the end to find the loop-closed exit phi.  */
3226   latch_e = loop_latch_edge (iv_loop);
3227   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3228
       /* Recover the initial value and the step of the iv from its scalar
          evolution in IV_LOOP.  The asserts document that this is expected
          to succeed for every phi that reaches this function.  */
3229   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3230   gcc_assert (access_fn);
3231   STRIP_NOPS (access_fn);
3232   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3233                                     &init_expr, &step_expr);
3234   gcc_assert (ok);
       /* From here on PE is the preheader edge of IV_LOOP, which is the
          inner loop when an outer loop is being vectorized.  */
3235   pe = loop_preheader_edge (iv_loop);
3236
       /* VECTYPE is the vector type for the type of the initial value,
          RESVECTYPE the one for the type of the phi result.  NCOPIES is the
          number of VECTYPE vectors needed to cover VF elements.  */
3237   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3238   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3239   gcc_assert (vectype);
3240   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3241   ncopies = vf / nunits;
3242
3243   gcc_assert (phi_info);
3244   gcc_assert (ncopies >= 1);
3245
3246   /* Find the first insertion point in the BB.  The stmts created for the
          body of the loop below are all inserted before SI.  */
3247   si = gsi_after_labels (bb);
3248
3249   /* Create the vector that holds the initial_value of the induction.  */
3250   if (nested_in_vect_loop)
3251     {
3252       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3253          been created during vectorization of previous stmts.  We obtain it
3254          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3255       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3256                                            loop_preheader_edge (iv_loop));
3257       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3258       /* If the initial value is not of proper type, convert it.  The
              conversion is emitted on the preheader edge of IV_LOOP.  */
3259       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3260         {
3261           new_stmt = gimple_build_assign_with_ops
3262               (VIEW_CONVERT_EXPR,
3263                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3264                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3265           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3266           gimple_assign_set_lhs (new_stmt, vec_init);
3267           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3268                                                  new_stmt);
3269           gcc_assert (!new_bb);
3270           set_vinfo_for_stmt (new_stmt,
3271                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3272         }
3273     }
3274   else
3275     {
3276       vec<constructor_elt, va_gc> *v;
3277
3278       /* iv_loop is the loop to be vectorized. Create:
3279          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3280       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3281                                        vect_scalar_var, "var_");
3282       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3283                                                      init_expr),
3284                                        &stmts, false, new_var);
3285       if (stmts)
3286         {
3287           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3288           gcc_assert (!new_bb);
3289         }
3290
           /* CONSTANT_P stays true only while every element is a gimple
              invariant; it selects how the vector is built below.  */
3291       vec_alloc (v, nunits);
3292       bool constant_p = is_gimple_min_invariant (new_name);
3293       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3294       for (i = 1; i < nunits; i++)
3295         {
3296           /* Create: new_name_i = new_name + step_expr  */
3297           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3298                                   new_name, step_expr);
               /* A sum that did not fold to an invariant is computed by a
                  new stmt on the preheader edge.  */
3299           if (!is_gimple_min_invariant (new_name))
3300             {
3301               init_stmt = gimple_build_assign (new_var, new_name);
3302               new_name = make_ssa_name (new_var, init_stmt);
3303               gimple_assign_set_lhs (init_stmt, new_name);
3304               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3305               gcc_assert (!new_bb);
3306               if (dump_enabled_p ())
3307                 {
3308                   dump_printf_loc (MSG_NOTE, vect_location,
3309                                    "created new init_stmt: ");
3310                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3311                   dump_printf (MSG_NOTE, "\n");
3312                 }
3313               constant_p = false;
3314             }
3315           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3316         }
3317       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3318       if (constant_p)
3319         new_vec = build_vector_from_ctor (vectype, v);
3320       else
3321         new_vec = build_constructor (vectype, v);
3322       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3323     }
3324
3325
3326   /* Create the vector that holds the step of the induction.  */
3327   if (nested_in_vect_loop)
3328     /* iv_loop is nested in the loop to be vectorized. Generate:
3329        vec_step = [S, S, S, S]  */
3330     new_name = step_expr;
3331   else
3332     {
3333       /* iv_loop is the loop to be vectorized. Generate:
3334           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3335       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3336         {
3337           expr = build_int_cst (integer_type_node, vf);
3338           expr = fold_convert (TREE_TYPE (step_expr), expr);
3339         }
3340       else
3341         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3342       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3343                               expr, step_expr);
           /* NOTE(review): for a non-constant step, vect_init_vector is
              presumably what turns the product into an SSA name; the assert
              below depends on it -- confirm against its definition.  */
3344       if (TREE_CODE (step_expr) == SSA_NAME)
3345         new_name = vect_init_vector (iv_phi, new_name,
3346                                      TREE_TYPE (step_expr), NULL);
3347     }
3348
3349   t = unshare_expr (new_name);
3350   gcc_assert (CONSTANT_CLASS_P (new_name)
3351               || TREE_CODE (new_name) == SSA_NAME);
3352   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3353   gcc_assert (stepvectype);
3354   new_vec = build_vector_from_val (stepvectype, t);
3355   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3356
3357
3358   /* Create the following def-use cycle:
3359      loop prolog:
3360          vec_init = ...
3361          vec_step = ...
3362      loop:
3363          vec_iv = PHI <vec_init, vec_loop>
3364          ...
3365          STMT
3366          ...
3367          vec_loop = vec_iv + vec_step;  */
3368
3369   /* Create the induction-phi that defines the induction-operand.  */
3370   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3371   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3372   set_vinfo_for_stmt (induction_phi,
3373                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3374   induc_def = PHI_RESULT (induction_phi);
3375
3376   /* Create the iv update inside the loop  */
3377   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3378                                            induc_def, vec_step);
3379   vec_def = make_ssa_name (vec_dest, new_stmt);
3380   gimple_assign_set_lhs (new_stmt, vec_def);
3381   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3382   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3383                                                    NULL));
3384
3385   /* Set the arguments of the phi node:  */
3386   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3387   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3388                UNKNOWN_LOCATION);
3389
3390
3391   /* In case that vectorization factor (VF) is bigger than the number
3392      of elements that we can fit in a vectype (nunits), we have to generate
3393      more than one vector stmt - i.e - we need to "unroll" the
3394      vector stmt by a factor VF/nunits.  For more details see documentation
3395      in vectorizable_operation.  */
3396
3397   if (ncopies > 1)
3398     {
3399       stmt_vec_info prev_stmt_vinfo;
3400       /* FORNOW. This restriction should be relaxed.  */
3401       gcc_assert (!nested_in_vect_loop);
3402
3403       /* Create the vector that holds the step of the induction.  Between
              consecutive copies the step is nunits*S rather than VF*S.  */
3404       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3405         {
3406           expr = build_int_cst (integer_type_node, nunits);
3407           expr = fold_convert (TREE_TYPE (step_expr), expr);
3408         }
3409       else
3410         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3411       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3412                               expr, step_expr);
3413       if (TREE_CODE (step_expr) == SSA_NAME)
3414         new_name = vect_init_vector (iv_phi, new_name,
3415                                      TREE_TYPE (step_expr), NULL);
3416       t = unshare_expr (new_name);
3417       gcc_assert (CONSTANT_CLASS_P (new_name)
3418                   || TREE_CODE (new_name) == SSA_NAME);
3419       new_vec = build_vector_from_val (stepvectype, t);
3420       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3421
           /* Copy I is derived from copy I-1, starting from the phi result.
              The copies are chained through STMT_VINFO_RELATED_STMT,
              beginning at the induction phi.  */
3422       vec_def = induc_def;
3423       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3424       for (i = 1; i < ncopies; i++)
3425         {
3426           /* vec_i = vec_prev + vec_step  */
3427           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3428                                                    vec_def, vec_step);
3429           vec_def = make_ssa_name (vec_dest, new_stmt);
3430           gimple_assign_set_lhs (new_stmt, vec_def);
3431
3432           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
               /* When a conversion to RESVECTYPE is needed, NEW_STMT becomes
                  the conversion, so that is the stmt recorded in the chain;
                  VEC_DEF keeps the unconverted sum for the next copy.  */
3433           if (!useless_type_conversion_p (resvectype, vectype))
3434             {
3435               new_stmt = gimple_build_assign_with_ops
3436                   (VIEW_CONVERT_EXPR,
3437                    vect_get_new_vect_var (resvectype, vect_simple_var,
3438                                           "vec_iv_"),
3439                    build1 (VIEW_CONVERT_EXPR, resvectype,
3440                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3441               gimple_assign_set_lhs (new_stmt,
3442                                      make_ssa_name
3443                                        (gimple_assign_lhs (new_stmt), new_stmt));
3444               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3445             }
3446           set_vinfo_for_stmt (new_stmt,
3447                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3448           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3449           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3450         }
3451     }
3452
3453   if (nested_in_vect_loop)
3454     {
3455       /* Find the loop-closed exit-phi of the induction, and record
3456          the final vector of induction results:  */
3457       exit_phi = NULL;
3458       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3459         {
3460           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3461             {
3462               exit_phi = USE_STMT (use_p);
3463               break;
3464             }
3465         }
3466       if (exit_phi)
3467         {
               /* This declaration shadows the function-level STMT_VINFO; here
                  it is the stmt info of EXIT_PHI.  */
3468           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3469           /* FORNOW. Currently not supporting the case that an inner-loop induction
3470              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3471           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3472                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3473
               /* NEW_STMT is the iv update created above: NCOPIES is 1 here,
                  since the NCOPIES > 1 path asserts the phi is not nested.  */
3474           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3475           if (dump_enabled_p ())
3476             {
3477               dump_printf_loc (MSG_NOTE, vect_location,
3478                                "vector of inductions after inner-loop:");
3479               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3480               dump_printf (MSG_NOTE, "\n");
3481             }
3482         }
3483     }
3484
3485
3486   if (dump_enabled_p ())
3487     {
3488       dump_printf_loc (MSG_NOTE, vect_location,
3489                        "transform induction: created def-use cycle: ");
3490       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3491       dump_printf (MSG_NOTE, "\n");
3492       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3493                         SSA_NAME_DEF_STMT (vec_def), 0);
3494       dump_printf (MSG_NOTE, "\n");
3495     }
3496
       /* Record the induction phi as the vectorized form of IV_PHI.  If the
          phi result has to be viewed as RESVECTYPE, emit the conversion at
          the first non-label position of BB, return the converted name, and
          let the conversion head the same chain of copies as the phi.  */
3497   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3498   if (!useless_type_conversion_p (resvectype, vectype))
3499     {
3500       new_stmt = gimple_build_assign_with_ops
3501          (VIEW_CONVERT_EXPR,
3502           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3503           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3504       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3505       gimple_assign_set_lhs (new_stmt, induc_def);
3506       si = gsi_after_labels (bb);
3507       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3508       set_vinfo_for_stmt (new_stmt,
3509                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3510       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3511         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3512     }
3513
3514   return induc_def;
3515 }
3516
3517
3518 /* Function get_initial_def_for_reduction
3519
3520    Input:
3521    STMT - a stmt that performs a reduction operation in the loop.
3522    INIT_VAL - the initial value of the reduction variable
3523
3524    Output:
3525    ADJUSTMENT_DEF - if not NULL, set to a value to be combined with the final
3526         result of the reduction (used for adjusting the epilog - see below),
             or to NULL when no adjustment is needed.
3527    Return a vector variable, initialized according to the operation that STMT
3528         performs. This vector will be used as the initial value of the
3529         vector of partial results.
3530
3531    Option1 (adjust in epilog): Initialize the vector as follows:
3532      add/bit or/xor:    [0,0,...,0,0]
3533      mult:              [1,1,...,1,1]
          bit and:           [-1,-1,...,-1,-1]
3534      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3535    and when necessary (e.g. add/mult case) let the caller know
3536    that it needs to adjust the result by init_val.
3537
3538    Option2: Initialize the vector as follows:
3539      add/bit or/xor:    [init_val,0,0,...,0]
3540      mult:              [init_val,1,1,...,1]
          bit and:           [init_val,-1,-1,...,-1]
3541      min/max/cond_expr: [init_val,init_val,...,init_val]
3542    and no adjustments are needed.
3543
        Subtraction, WIDEN_SUM_EXPR and DOT_PROD_EXPR are treated like add.

3544    For example, for the following code:
3545
3546    s = init_val;
3547    for (i=0;i<n;i++)
3548      s = s + a[i];
3549
3550    STMT is 's = s + a[i]', and the reduction variable is 's'.
3551    For a vector of 4 units, we want to return either [init_val,0,0,0],
3552    or [0,0,0,0] and let the caller know that it needs to adjust
3553    the result at the end by 'init_val'.
3554
3555    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3556    initialization vector is simpler (same element in all entries), if
3557    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3558
3559    A cost model should help decide between these two schemes.  */
3560
3561 tree
3562 get_initial_def_for_reduction (gimple stmt, tree init_val,
3563                                tree *adjustment_def)
3564 {
3565   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3566   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3567   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3568   tree scalar_type = TREE_TYPE (init_val);
3569   tree vectype = get_vectype_for_scalar_type (scalar_type);
3570   int nunits;
3571   enum tree_code code = gimple_assign_rhs_code (stmt);
3572   tree def_for_init;
3573   tree init_def;
3574   tree *elts;
3575   int i;
3576   bool nested_in_vect_loop = false;
3577   tree init_value;
       /* Neutral element of the reduction; defaults to zero and is changed
          below for MULT_EXPR and BIT_AND_EXPR.  */
3578   REAL_VALUE_TYPE real_init_val = dconst0;
3579   int int_init_val = 0;
3580   gimple def_stmt = NULL;
3581
3582   gcc_assert (vectype);
3583   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3584
3585   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3586               || SCALAR_FLOAT_TYPE_P (scalar_type));
3587
3588   if (nested_in_vect_loop_p (loop, stmt))
3589     nested_in_vect_loop = true;
3590   else
3591     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3592
3593   /* In case of double reduction we only create a vector variable to be put
3594      in the reduction phi node.  The actual statement creation is done in
3595      vect_create_epilog_for_reduction.  This case is recognized by INIT_VAL
          being defined by a phi inside LOOP whose def type is
          vect_double_reduction_def.  */
3596   if (adjustment_def && nested_in_vect_loop
3597       && TREE_CODE (init_val) == SSA_NAME
3598       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3599       && gimple_code (def_stmt) == GIMPLE_PHI
3600       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3601       && vinfo_for_stmt (def_stmt)
3602       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3603           == vect_double_reduction_def)
3604     {
3605       *adjustment_def = NULL;
3606       return vect_create_destination_var (init_val, vectype);
3607     }
3608
       /* INIT_VALUE is only consumed by the min/max/cond_expr Option2 case
          below.
          NOTE(review): TREE_INT_CST_LOW presumably yields only the low part
          of the constant, and TREE_CONSTANT may hold for trees that are not
          INTEGER_CST/REAL_CST -- confirm that callers cannot pass such
          values.  */
3609   if (TREE_CONSTANT (init_val))
3610     {
3611       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3612         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3613       else
3614         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3615     }
3616   else
3617     init_value = init_val;
3618
3619   switch (code)
3620     {
3621       case WIDEN_SUM_EXPR:
3622       case DOT_PROD_EXPR:
3623       case PLUS_EXPR:
3624       case MINUS_EXPR:
3625       case BIT_IOR_EXPR:
3626       case BIT_XOR_EXPR:
3627       case MULT_EXPR:
3628       case BIT_AND_EXPR:
3629         /* ADJUSTMENT_DEF is NULL when called from
3630            vect_create_epilog_for_reduction to vectorize double reduction.
                Otherwise report the value the caller has to apply in the
                epilog: the vectorized def of INIT_VAL for a nested reduction,
                INIT_VAL itself otherwise.  */
3631         if (adjustment_def)
3632           {
3633             if (nested_in_vect_loop)
3634               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3635                                                               NULL);
3636             else
3637               *adjustment_def = init_val;
3638           }
3639
3640         if (code == MULT_EXPR)
3641           {
3642             real_init_val = dconst1;
3643             int_init_val = 1;
3644           }
3645
3646         if (code == BIT_AND_EXPR)
3647           int_init_val = -1;
3648
3649         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3650           def_for_init = build_real (scalar_type, real_init_val);
3651         else
3652           def_for_init = build_int_cst (scalar_type, int_init_val);
3653
3654         /* Create a vector of '0', '1' or '-1' except the first element:
                fill elements 1 .. nunits-1 here, element 0 is set below.  */
3655         elts = XALLOCAVEC (tree, nunits);
3656         for (i = nunits - 2; i >= 0; --i)
3657           elts[i + 1] = def_for_init;
3658
3659         /* Option1: the first element is '0', '1' or '-1' as well.  */
3660         if (adjustment_def)
3661           {
3662             elts[0] = def_for_init;
3663             init_def = build_vector (vectype, elts);
3664             break;
3665           }
3666
3667         /* Option2: the first element is INIT_VAL.  A constant INIT_VAL is
                built with build_vector, anything else needs a CONSTRUCTOR.  */
3668         elts[0] = init_val;
3669         if (TREE_CONSTANT (init_val))
3670           init_def = build_vector (vectype, elts);
3671         else
3672           {
3673             vec<constructor_elt, va_gc> *v;
3674             vec_alloc (v, nunits);
3675             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3676             for (i = 1; i < nunits; ++i)
3677               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3678             init_def = build_constructor (vectype, v);
3679           }
3680
3681         break;
3682
3683       case MIN_EXPR:
3684       case MAX_EXPR:
3685       case COND_EXPR:
             /* Every element starts as INIT_VAL, so no epilog adjustment is
                reported.  With ADJUSTMENT_DEF the vector comes from the
                vectorized def of INIT_VAL, otherwise it is built here from
                INIT_VALUE.  */
3686         if (adjustment_def)
3687           {
3688             *adjustment_def = NULL_TREE;
3689             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3690             break;
3691           }
3692
3693         init_def = build_vector_from_val (vectype, init_value);
3694         break;
3695
3696       default:
3697         gcc_unreachable ();
3698     }
3699
3700   return init_def;
3701 }
3702
3703
3704 /* Function vect_create_epilog_for_reduction
3705
3706    Create code at the loop-epilog to finalize the result of a reduction
3707    computation.
3708
3709    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of
3710      the vector reduction statements.
3711    STMT is the scalar reduction stmt that is being vectorized.
3712    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3713      number of elements that we can fit in a vectype (nunits).  In this case
3714      we have to generate more than one vector stmt - i.e - we need to "unroll"
3715      the vector stmt by a factor VF/nunits.  For more details see documentation
3716      in vectorizable_operation.
3717    REDUC_CODE is the tree-code for the epilog reduction.
3718    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3719      computation.
3720    REDUC_INDEX is the index of the operand in the right hand side of the
3721      statement that is defined by REDUCTION_PHI.
3722    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3723    SLP_NODE is an SLP node containing a group of reduction statements. The
3724      first one in this group is STMT.
3725
3726    This function:
3727    1. Creates the reduction def-use cycles: sets the arguments for
3728       REDUCTION_PHIS:
3729       The loop-entry argument is the vectorized initial-value of the reduction.
3730       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3731       sums.
3732    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3733       by applying the operation specified by REDUC_CODE if available, or by
3734       other means (whole-vector shifts or a scalar loop).
3735       The function also creates a new phi node at the loop exit to preserve
3736       loop-closed form, as illustrated below.
3737
3738      The flow at the entry to this function:
3739
3740         loop:
3741           vec_def = phi <null, null>            # REDUCTION_PHI
3742           VECT_DEF = vector_stmt                # vectorized form of STMT
3743           s_loop = scalar_stmt                  # (scalar) STMT
3744         loop_exit:
3745           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3746           use <s_out0>
3747           use <s_out0>
3748
3749      The above is transformed by this function into:
3750
3751         loop:
3752           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3753           VECT_DEF = vector_stmt                # vectorized form of STMT
3754           s_loop = scalar_stmt                  # (scalar) STMT
3755         loop_exit:
3756           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3757           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3758           v_out2 = reduce <v_out1>
3759           s_out3 = extract_field <v_out2, 0>
3760           s_out4 = adjust_result <s_out3>
3761           use <s_out4>
3762           use <s_out4>
3763 */
3764
3765 static void
3766 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3767                                   int ncopies, enum tree_code reduc_code,
3768                                   vec<gimple> reduction_phis,
3769                                   int reduc_index, bool double_reduc,
3770                                   slp_tree slp_node)
3771 {
3772   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3773   stmt_vec_info prev_phi_info;
3774   tree vectype;
3775   enum machine_mode mode;
3776   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3777   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3778   basic_block exit_bb;
3779   tree scalar_dest;
3780   tree scalar_type;
3781   gimple new_phi = NULL, phi;
3782   gimple_stmt_iterator exit_gsi;
3783   tree vec_dest;
3784   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3785   gimple epilog_stmt = NULL;
3786   enum tree_code code = gimple_assign_rhs_code (stmt);
3787   gimple exit_phi;
3788   tree bitsize, bitpos;
3789   tree adjustment_def = NULL;
3790   tree vec_initial_def = NULL;
3791   tree reduction_op, expr, def;
3792   tree orig_name, scalar_result;
3793   imm_use_iterator imm_iter, phi_imm_iter;
3794   use_operand_p use_p, phi_use_p;
3795   bool extract_scalar_result = false;
3796   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3797   bool nested_in_vect_loop = false;
3798   vec<gimple> new_phis = vNULL;
3799   vec<gimple> inner_phis = vNULL;
3800   enum vect_def_type dt = vect_unknown_def_type;
3801   int j, i;
3802   vec<tree> scalar_results = vNULL;
3803   unsigned int group_size = 1, k, ratio;
3804   vec<tree> vec_initial_defs = vNULL;
3805   vec<gimple> phis;
3806   bool slp_reduc = false;
3807   tree new_phi_result;
3808   gimple inner_phi = NULL;
3809
3810   if (slp_node)
3811     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3812
3813   if (nested_in_vect_loop_p (loop, stmt))
3814     {
3815       outer_loop = loop;
3816       loop = loop->inner;
3817       nested_in_vect_loop = true;
3818       gcc_assert (!slp_node);
3819     }
3820
3821   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3822     {
3823     case GIMPLE_SINGLE_RHS:
3824       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3825                   == ternary_op);
3826       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3827       break;
3828     case GIMPLE_UNARY_RHS:
3829       reduction_op = gimple_assign_rhs1 (stmt);
3830       break;
3831     case GIMPLE_BINARY_RHS:
3832       reduction_op = reduc_index ?
3833                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3834       break;
3835     case GIMPLE_TERNARY_RHS:
3836       reduction_op = gimple_op (stmt, reduc_index + 1);
3837       break;
3838     default:
3839       gcc_unreachable ();
3840     }
3841
3842   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3843   gcc_assert (vectype);
3844   mode = TYPE_MODE (vectype);
3845
3846   /* 1. Create the reduction def-use cycle:
3847      Set the arguments of REDUCTION_PHIS, i.e., transform
3848
3849         loop:
3850           vec_def = phi <null, null>            # REDUCTION_PHI
3851           VECT_DEF = vector_stmt                # vectorized form of STMT
3852           ...
3853
3854      into:
3855
3856         loop:
3857           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3858           VECT_DEF = vector_stmt                # vectorized form of STMT
3859           ...
3860
3861      (in case of SLP, do it for all the phis). */
3862
3863   /* Get the loop-entry arguments.  */
3864   if (slp_node)
3865     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3866                        NULL, slp_node, reduc_index);
3867   else
3868     {
3869       vec_initial_defs.create (1);
3870      /* For the case of reduction, vect_get_vec_def_for_operand returns
3871         the scalar def before the loop, that defines the initial value
3872         of the reduction variable.  */
3873       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3874                                                       &adjustment_def);
3875       vec_initial_defs.quick_push (vec_initial_def);
3876     }
3877
3878   /* Set phi nodes arguments.  */
3879   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3880     {
3881       tree vec_init_def = vec_initial_defs[i];
3882       tree def = vect_defs[i];
3883       for (j = 0; j < ncopies; j++)
3884         {
3885           /* Set the loop-entry arg of the reduction-phi.  */
3886           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3887                        UNKNOWN_LOCATION);
3888
3889           /* Set the loop-latch arg for the reduction-phi.  */
3890           if (j > 0)
3891             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3892
3893           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3894
3895           if (dump_enabled_p ())
3896             {
3897               dump_printf_loc (MSG_NOTE, vect_location,
3898                                "transform reduction: created def-use cycle: ");
3899               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3900               dump_printf (MSG_NOTE, "\n");
3901               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3902               dump_printf (MSG_NOTE, "\n");
3903             }
3904
3905           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3906         }
3907     }
3908
3909   vec_initial_defs.release ();
3910
3911   /* 2. Create epilog code.
3912         The reduction epilog code operates across the elements of the vector
3913         of partial results computed by the vectorized loop.
3914         The reduction epilog code consists of:
3915
3916         step 1: compute the scalar result in a vector (v_out2)
3917         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3918         step 3: adjust the scalar result (s_out3) if needed.
3919
3920         Step 1 can be accomplished using one of the following three schemes:
3921           (scheme 1) using reduc_code, if available.
3922           (scheme 2) using whole-vector shifts, if available.
3923           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3924                      combined.
3925
3926           The overall epilog code looks like this:
3927
3928           s_out0 = phi <s_loop>         # original EXIT_PHI
3929           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3930           v_out2 = reduce <v_out1>              # step 1
3931           s_out3 = extract_field <v_out2, 0>    # step 2
3932           s_out4 = adjust_result <s_out3>       # step 3
3933
3934           (step 3 is optional, and steps 1 and 2 may be combined).
3935           Lastly, the uses of s_out0 are replaced by s_out4.  */
3936
3937
3938   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3939          v_out1 = phi <VECT_DEF>
3940          Store them in NEW_PHIS.  */
3941
3942   exit_bb = single_exit (loop)->dest;
3943   prev_phi_info = NULL;
3944   new_phis.create (vect_defs.length ());
3945   FOR_EACH_VEC_ELT (vect_defs, i, def)
3946     {
3947       for (j = 0; j < ncopies; j++)
3948         {
3949           tree new_def = copy_ssa_name (def, NULL);
3950           phi = create_phi_node (new_def, exit_bb);
3951           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3952           if (j == 0)
3953             new_phis.quick_push (phi);
3954           else
3955             {
3956               def = vect_get_vec_def_for_stmt_copy (dt, def);
3957               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3958             }
3959
3960           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3961           prev_phi_info = vinfo_for_stmt (phi);
3962         }
3963     }
3964
3965   /* The epilogue is created for the outer-loop, i.e., for the loop being
3966      vectorized.  Create exit phis for the outer loop.  */
3967   if (double_reduc)
3968     {
3969       loop = outer_loop;
3970       exit_bb = single_exit (loop)->dest;
3971       inner_phis.create (vect_defs.length ());
3972       FOR_EACH_VEC_ELT (new_phis, i, phi)
3973         {
3974           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3975           gimple outer_phi = create_phi_node (new_result, exit_bb);
3976           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3977                            PHI_RESULT (phi));
3978           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3979                                                             loop_vinfo, NULL));
3980           inner_phis.quick_push (phi);
3981           new_phis[i] = outer_phi;
3982           prev_phi_info = vinfo_for_stmt (outer_phi);
3983           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3984             {
3985               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3986               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3987               outer_phi = create_phi_node (new_result, exit_bb);
3988               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3989                                PHI_RESULT (phi));
3990               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3991                                                         loop_vinfo, NULL));
3992               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3993               prev_phi_info = vinfo_for_stmt (outer_phi);
3994             }
3995         }
3996     }
3997
3998   exit_gsi = gsi_after_labels (exit_bb);
3999
4000   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4001          (i.e. when reduc_code is not available) and in the final adjustment
4002          code (if needed).  Also get the original scalar reduction variable as
4003          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4004          represents a reduction pattern), the tree-code and scalar-def are
4005          taken from the original stmt that the pattern-stmt (STMT) replaces.
4006          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4007          are taken from STMT.  */
4008
4009   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4010   if (!orig_stmt)
4011     {
4012       /* Regular reduction  */
4013       orig_stmt = stmt;
4014     }
4015   else
4016     {
4017       /* Reduction pattern  */
4018       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4019       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4020       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4021     }
4022
4023   code = gimple_assign_rhs_code (orig_stmt);
4024   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4025      partial results are added and not subtracted.  */
4026   if (code == MINUS_EXPR)
4027     code = PLUS_EXPR;
4028
4029   scalar_dest = gimple_assign_lhs (orig_stmt);
4030   scalar_type = TREE_TYPE (scalar_dest);
4031   scalar_results.create (group_size);
4032   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4033   bitsize = TYPE_SIZE (scalar_type);
4034
4035   /* In case this is a reduction in an inner-loop while vectorizing an outer
4036      loop - we don't need to extract a single scalar result at the end of the
4037      inner-loop (unless it is double reduction, i.e., the use of reduction is
4038      outside the outer-loop).  The final vector of partial results will be used
4039      in the vectorized outer-loop, or reduced to a scalar result at the end of
4040      the outer-loop.  */
4041   if (nested_in_vect_loop && !double_reduc)
4042     goto vect_finalize_reduction;
4043
4044   /* SLP reduction without reduction chain, e.g.,
4045      # a1 = phi <a2, a0>
4046      # b1 = phi <b2, b0>
4047      a2 = operation (a1)
4048      b2 = operation (b1)  */
4049   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4050
4051   /* In case of reduction chain, e.g.,
4052      # a1 = phi <a3, a0>
4053      a2 = operation (a1)
4054      a3 = operation (a2),
4055
4056      we may end up with more than one vector result.  Here we reduce them to
4057      one vector.  */
4058   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4059     {
4060       tree first_vect = PHI_RESULT (new_phis[0]);
4061       tree tmp;
4062       gimple new_vec_stmt = NULL;
4063
4064       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4065       for (k = 1; k < new_phis.length (); k++)
4066         {
4067           gimple next_phi = new_phis[k];
4068           tree second_vect = PHI_RESULT (next_phi);
4069
4070           tmp = build2 (code, vectype,  first_vect, second_vect);
4071           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4072           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4073           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4074           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4075         }
4076
4077       new_phi_result = first_vect;
4078       if (new_vec_stmt)
4079         {
4080           new_phis.truncate (0);
4081           new_phis.safe_push (new_vec_stmt);
4082         }
4083     }
4084   else
4085     new_phi_result = PHI_RESULT (new_phis[0]);
4086
4087   /* 2.3 Create the reduction code, using one of the three schemes described
4088          above. In SLP we simply need to extract all the elements from the
4089          vector (without reducing them), so we use scalar shifts.  */
4090   if (reduc_code != ERROR_MARK && !slp_reduc)
4091     {
4092       tree tmp;
4093
4094       /*** Case 1:  Create:
4095            v_out2 = reduc_expr <v_out1>  */
4096
4097       if (dump_enabled_p ())
4098         dump_printf_loc (MSG_NOTE, vect_location,
4099                          "Reduce using direct vector reduction.\n");
4100
4101       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4102       tmp = build1 (reduc_code, vectype, new_phi_result);
4103       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4104       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4105       gimple_assign_set_lhs (epilog_stmt, new_temp);
4106       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4107
4108       extract_scalar_result = true;
4109     }
4110   else
4111     {
4112       enum tree_code shift_code = ERROR_MARK;
4113       bool have_whole_vector_shift = true;
4114       int bit_offset;
4115       int element_bitsize = tree_to_uhwi (bitsize);
4116       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4117       tree vec_temp;
4118
4119       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4120         shift_code = VEC_RSHIFT_EXPR;
4121       else
4122         have_whole_vector_shift = false;
4123
4124       /* Regardless of whether we have a whole vector shift, if we're
4125          emulating the operation via tree-vect-generic, we don't want
4126          to use it.  Only the first round of the reduction is likely
4127          to still be profitable via emulation.  */
4128       /* ??? It might be better to emit a reduction tree code here, so that
4129          tree-vect-generic can expand the first round via bit tricks.  */
4130       if (!VECTOR_MODE_P (mode))
4131         have_whole_vector_shift = false;
4132       else
4133         {
4134           optab optab = optab_for_tree_code (code, vectype, optab_default);
4135           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4136             have_whole_vector_shift = false;
4137         }
4138
4139       if (have_whole_vector_shift && !slp_reduc)
4140         {
4141           /*** Case 2: Create:
4142              for (offset = VS/2; offset >= element_size; offset/=2)
4143                 {
4144                   Create:  va' = vec_shift <va, offset>
4145                   Create:  va = vop <va, va'>
4146                 }  */
4147
4148           if (dump_enabled_p ())
4149             dump_printf_loc (MSG_NOTE, vect_location,
4150                              "Reduce using vector shifts\n");
4151
4152           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4153           new_temp = new_phi_result;
4154           for (bit_offset = vec_size_in_bits/2;
4155                bit_offset >= element_bitsize;
4156                bit_offset /= 2)
4157             {
4158               tree bitpos = size_int (bit_offset);
4159
4160               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4161                                                vec_dest, new_temp, bitpos);
4162               new_name = make_ssa_name (vec_dest, epilog_stmt);
4163               gimple_assign_set_lhs (epilog_stmt, new_name);
4164               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4165
4166               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4167                                                           new_name, new_temp);
4168               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4169               gimple_assign_set_lhs (epilog_stmt, new_temp);
4170               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4171             }
4172
4173           extract_scalar_result = true;
4174         }
4175       else
4176         {
4177           tree rhs;
4178
4179           /*** Case 3: Create:
4180              s = extract_field <v_out2, 0>
4181              for (offset = element_size;
4182                   offset < vector_size;
4183                   offset += element_size;)
4184                {
4185                  Create:  s' = extract_field <v_out2, offset>
4186                  Create:  s = op <s, s'>  // For non SLP cases
4187                }  */
4188
4189           if (dump_enabled_p ())
4190             dump_printf_loc (MSG_NOTE, vect_location,
4191                              "Reduce using scalar code.\n");
4192
4193           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4194           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4195             {
4196               if (gimple_code (new_phi) == GIMPLE_PHI)
4197                 vec_temp = PHI_RESULT (new_phi);
4198               else
4199                 vec_temp = gimple_assign_lhs (new_phi);
4200               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4201                             bitsize_zero_node);
4202               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4203               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4204               gimple_assign_set_lhs (epilog_stmt, new_temp);
4205               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4206
4207               /* In SLP we don't need to apply reduction operation, so we just
4208                  collect s' values in SCALAR_RESULTS.  */
4209               if (slp_reduc)
4210                 scalar_results.safe_push (new_temp);
4211
4212               for (bit_offset = element_bitsize;
4213                    bit_offset < vec_size_in_bits;
4214                    bit_offset += element_bitsize)
4215                 {
4216                   tree bitpos = bitsize_int (bit_offset);
4217                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4218                                      bitsize, bitpos);
4219
4220                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4221                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4222                   gimple_assign_set_lhs (epilog_stmt, new_name);
4223                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4224
4225                   if (slp_reduc)
4226                     {
4227                       /* In SLP we don't need to apply reduction operation, so
4228                          we just collect s' values in SCALAR_RESULTS.  */
4229                       new_temp = new_name;
4230                       scalar_results.safe_push (new_name);
4231                     }
4232                   else
4233                     {
4234                       epilog_stmt = gimple_build_assign_with_ops (code,
4235                                           new_scalar_dest, new_name, new_temp);
4236                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4237                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4238                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4239                     }
4240                 }
4241             }
4242
4243           /* The only case where we need to reduce scalar results in SLP, is
4244              unrolling.  If the size of SCALAR_RESULTS is greater than
4245              GROUP_SIZE, we reduce them combining elements modulo
4246              GROUP_SIZE.  */
4247           if (slp_reduc)
4248             {
4249               tree res, first_res, new_res;
4250               gimple new_stmt;
4251
4252               /* Reduce multiple scalar results in case of SLP unrolling.  */
4253               for (j = group_size; scalar_results.iterate (j, &res);
4254                    j++)
4255                 {
4256                   first_res = scalar_results[j % group_size];
4257                   new_stmt = gimple_build_assign_with_ops (code,
4258                                               new_scalar_dest, first_res, res);
4259                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4260                   gimple_assign_set_lhs (new_stmt, new_res);
4261                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4262                   scalar_results[j % group_size] = new_res;
4263                 }
4264             }
4265           else
4266             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4267             scalar_results.safe_push (new_temp);
4268
4269           extract_scalar_result = false;
4270         }
4271     }
4272
4273   /* 2.4  Extract the final scalar result.  Create:
4274           s_out3 = extract_field <v_out2, bitpos>  */
4275
4276   if (extract_scalar_result)
4277     {
4278       tree rhs;
4279
4280       if (dump_enabled_p ())
4281         dump_printf_loc (MSG_NOTE, vect_location,
4282                          "extract scalar result\n");
4283
4284       if (BYTES_BIG_ENDIAN)
4285         bitpos = size_binop (MULT_EXPR,
4286                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4287                              TYPE_SIZE (scalar_type));
4288       else
4289         bitpos = bitsize_zero_node;
4290
4291       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4292       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4293       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4294       gimple_assign_set_lhs (epilog_stmt, new_temp);
4295       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4296       scalar_results.safe_push (new_temp);
4297     }
4298
4299 vect_finalize_reduction:
4300
4301   if (double_reduc)
4302     loop = loop->inner;
4303
4304   /* 2.5 Adjust the final result by the initial value of the reduction
4305          variable. (When such adjustment is not needed, then
4306          'adjustment_def' is zero).  For example, if code is PLUS we create:
4307          new_temp = loop_exit_def + adjustment_def  */
4308
4309   if (adjustment_def)
4310     {
4311       gcc_assert (!slp_reduc);
4312       if (nested_in_vect_loop)
4313         {
4314           new_phi = new_phis[0];
4315           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4316           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4317           new_dest = vect_create_destination_var (scalar_dest, vectype);
4318         }
4319       else
4320         {
4321           new_temp = scalar_results[0];
4322           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4323           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4324           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4325         }
4326
4327       epilog_stmt = gimple_build_assign (new_dest, expr);
4328       new_temp = make_ssa_name (new_dest, epilog_stmt);
4329       gimple_assign_set_lhs (epilog_stmt, new_temp);
4330       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4331       if (nested_in_vect_loop)
4332         {
4333           set_vinfo_for_stmt (epilog_stmt,
4334                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4335                                                  NULL));
4336           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4337                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4338
4339           if (!double_reduc)
4340             scalar_results.quick_push (new_temp);
4341           else
4342             scalar_results[0] = new_temp;
4343         }
4344       else
4345         scalar_results[0] = new_temp;
4346
4347       new_phis[0] = epilog_stmt;
4348     }
4349
4350   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4351           phis with new adjusted scalar results, i.e., replace use <s_out0>
4352           with use <s_out4>.
4353
4354      Transform:
4355         loop_exit:
4356           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4357           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4358           v_out2 = reduce <v_out1>
4359           s_out3 = extract_field <v_out2, 0>
4360           s_out4 = adjust_result <s_out3>
4361           use <s_out0>
4362           use <s_out0>
4363
4364      into:
4365
4366         loop_exit:
4367           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4368           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4369           v_out2 = reduce <v_out1>
4370           s_out3 = extract_field <v_out2, 0>
4371           s_out4 = adjust_result <s_out3>
4372           use <s_out4>
4373           use <s_out4> */
4374
4375
4376   /* In SLP reduction chain we reduce vector results into one vector if
4377      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4378      the last stmt in the reduction chain, since we are looking for the loop
4379      exit phi node.  */
4380   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4381     {
4382       scalar_dest = gimple_assign_lhs (
4383                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4384       group_size = 1;
4385     }
4386
4387   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4388      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4389      need to match SCALAR_RESULTS with corresponding statements.  The first
4390      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4391      the first vector stmt, etc.
4392      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4393   if (group_size > new_phis.length ())
4394     {
4395       ratio = group_size / new_phis.length ();
4396       gcc_assert (!(group_size % new_phis.length ()));
4397     }
4398   else
4399     ratio = 1;
4400
4401   for (k = 0; k < group_size; k++)
4402     {
4403       if (k % ratio == 0)
4404         {
4405           epilog_stmt = new_phis[k / ratio];
4406           reduction_phi = reduction_phis[k / ratio];
4407           if (double_reduc)
4408             inner_phi = inner_phis[k / ratio];
4409         }
4410
4411       if (slp_reduc)
4412         {
4413           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4414
4415           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4416           /* SLP statements can't participate in patterns.  */
4417           gcc_assert (!orig_stmt);
4418           scalar_dest = gimple_assign_lhs (current_stmt);
4419         }
4420
4421       phis.create (3);
4422       /* Find the loop-closed-use at the loop exit of the original scalar
4423          result.  (The reduction result is expected to have two immediate uses -
4424          one at the latch block, and one at the loop exit).  */
4425       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4426         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4427             && !is_gimple_debug (USE_STMT (use_p)))
4428           phis.safe_push (USE_STMT (use_p));
4429
4430       /* While we expect to have found an exit_phi because of loop-closed-ssa
4431          form we can end up without one if the scalar cycle is dead.  */
4432
4433       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4434         {
4435           if (outer_loop)
4436             {
4437               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4438               gimple vect_phi;
4439
4440               /* FORNOW. Currently not supporting the case that an inner-loop
4441                  reduction is not used in the outer-loop (but only outside the
4442                  outer-loop), unless it is double reduction.  */
4443               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4444                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4445                           || double_reduc);
4446
4447               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4448               if (!double_reduc
4449                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4450                       != vect_double_reduction_def)
4451                 continue;
4452
4453               /* Handle double reduction:
4454
4455                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4456                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4457                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4458                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4459
4460                  At that point the regular reduction (stmt2 and stmt3) is
4461                  already vectorized, as well as the exit phi node, stmt4.
4462                  Here we vectorize the phi node of double reduction, stmt1, and
4463                  update all relevant statements.  */
4464
4465               /* Go through all the uses of s2 to find double reduction phi
4466                  node, i.e., stmt1 above.  */
4467               orig_name = PHI_RESULT (exit_phi);
4468               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4469                 {
4470                   stmt_vec_info use_stmt_vinfo;
4471                   stmt_vec_info new_phi_vinfo;
4472                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4473                   basic_block bb = gimple_bb (use_stmt);
4474                   gimple use;
4475
4476                   /* Check that USE_STMT is really double reduction phi
4477                      node.  */
4478                   if (gimple_code (use_stmt) != GIMPLE_PHI
4479                       || gimple_phi_num_args (use_stmt) != 2
4480                       || bb->loop_father != outer_loop)
4481                     continue;
4482                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4483                   if (!use_stmt_vinfo
4484                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4485                           != vect_double_reduction_def)
4486                     continue;
4487
4488                   /* Create vector phi node for double reduction:
4489                      vs1 = phi <vs0, vs2>
4490                      vs1 was created previously in this function by a call to
4491                        vect_get_vec_def_for_operand and is stored in
4492                        vec_initial_def;
4493                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4494                      vs0 is created here.  */
4495
4496                   /* Create vector phi node.  */
4497                   vect_phi = create_phi_node (vec_initial_def, bb);
4498                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4499                                     loop_vec_info_for_loop (outer_loop), NULL);
4500                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4501
4502                   /* Create vs0 - initial def of the double reduction phi.  */
4503                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4504                                              loop_preheader_edge (outer_loop));
4505                   init_def = get_initial_def_for_reduction (stmt,
4506                                                           preheader_arg, NULL);
4507                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4508                                                     vectype, NULL);
4509
4510                   /* Update phi node arguments with vs0 and vs2.  */
4511                   add_phi_arg (vect_phi, vect_phi_init,
4512                                loop_preheader_edge (outer_loop),
4513                                UNKNOWN_LOCATION);
4514                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4515                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4516                   if (dump_enabled_p ())
4517                     {
4518                       dump_printf_loc (MSG_NOTE, vect_location,
4519                                        "created double reduction phi node: ");
4520                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4521                       dump_printf (MSG_NOTE, "\n");
4522                     }
4523
4524                   vect_phi_res = PHI_RESULT (vect_phi);
4525
4526                   /* Replace the use, i.e., set the correct vs1 in the regular
4527                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4528                      loop is redundant.  */
4529                   use = reduction_phi;
4530                   for (j = 0; j < ncopies; j++)
4531                     {
4532                       edge pr_edge = loop_preheader_edge (loop);
4533                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4534                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4535                     }
4536                 }
4537             }
4538         }
4539
4540       phis.release ();
4541       if (nested_in_vect_loop)
4542         {
4543           if (double_reduc)
4544             loop = outer_loop;
4545           else
4546             continue;
4547         }
4548
4549       phis.create (3);
4550       /* Find the loop-closed-use at the loop exit of the original scalar
4551          result.  (The reduction result is expected to have two immediate uses,
4552          one at the latch block, and one at the loop exit).  For double
4553          reductions we are looking for exit phis of the outer loop.  */
4554       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4555         {
4556           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4557             {
4558               if (!is_gimple_debug (USE_STMT (use_p)))
4559                 phis.safe_push (USE_STMT (use_p));
4560             }
4561           else
4562             {
4563               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4564                 {
4565                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4566
4567                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4568                     {
4569                       if (!flow_bb_inside_loop_p (loop,
4570                                              gimple_bb (USE_STMT (phi_use_p)))
4571                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4572                         phis.safe_push (USE_STMT (phi_use_p));
4573                     }
4574                 }
4575             }
4576         }
4577
4578       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4579         {
4580           /* Replace the uses:  */
4581           orig_name = PHI_RESULT (exit_phi);
4582           scalar_result = scalar_results[k];
4583           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4584             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4585               SET_USE (use_p, scalar_result);
4586         }
4587
4588       phis.release ();
4589     }
4590
4591   scalar_results.release ();
4592   inner_phis.release ();
4593   new_phis.release ();
4594 }
4595
4596
4597 /* Function vectorizable_reduction.
4598
4599    Check if STMT performs a reduction operation that can be vectorized.
4600    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4601    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4602    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4603
4604    This function also handles reduction idioms (patterns) that have been
4605    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4606    of this form:
4607      X = pattern_expr (arg0, arg1, ..., X)
4608    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4609    sequence that had been detected and replaced by the pattern-stmt (STMT).
4610
4611    In some cases of reduction patterns, the type of the reduction variable X is
4612    different than the type of the other arguments of STMT.
4613    In such cases, the vectype that is used when transforming STMT into a vector
4614    stmt is different than the vectype that is used to determine the
4615    vectorization factor, because it consists of a different number of elements
4616    than the actual number of elements that are being operated upon in parallel.
4617
4618    For example, consider an accumulation of shorts into an int accumulator.
4619    On some targets it's possible to vectorize this pattern operating on 8
4620    shorts at a time (hence, the vectype for purposes of determining the
4621    vectorization factor should be V8HI); on the other hand, the vectype that
4622    is used to create the vector form is actually V4SI (the type of the result).
4623
4624    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4625    indicates what is the actual level of parallelism (V8HI in the example), so
4626    that the right vectorization factor would be derived.  This vectype
4627    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4628    be used to create the vectorized stmt.  The right vectype for the vectorized
4629    stmt is obtained from the type of the result X:
4630         get_vectype_for_scalar_type (TREE_TYPE (X))
4631
4632    This means that, contrary to "regular" reductions (or "regular" stmts in
4633    general), the following equation:
4634       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4635    does *NOT* necessarily hold for reduction patterns.  */
4636
4637 bool
4638 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4639                         gimple *vec_stmt, slp_tree slp_node)
4640 {
       /* Overview of the body: (1) reject stmts that were not recognized as
          reductions, (2) note whether STMT is a pattern stmt, (3) check the
          operands, (4) check target support for the in-loop operation and
          for the epilog operation.  When VEC_STMT is NULL the function stops
          after recording the cost; otherwise it emits the reduction PHIs, the
          vector stmts and the epilog code.  */
4641   tree vec_dest;
4642   tree scalar_dest;
4643   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4644   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4645   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4646   tree vectype_in = NULL_TREE;
4647   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4648   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4649   enum tree_code code, orig_code, epilog_reduc_code;
4650   enum machine_mode vec_mode;
4651   int op_type;
4652   optab optab, reduc_optab;
4653   tree new_temp = NULL_TREE;
4654   tree def;
4655   gimple def_stmt;
4656   enum vect_def_type dt;
4657   gimple new_phi = NULL;
4658   tree scalar_type;
4659   bool is_simple_use;
4660   gimple orig_stmt;
4661   stmt_vec_info orig_stmt_info;
4662   tree expr = NULL_TREE;
4663   int i;
4664   int ncopies;
4665   int epilog_copies;
4666   stmt_vec_info prev_stmt_info, prev_phi_info;
4667   bool single_defuse_cycle = false;
4668   tree reduc_def = NULL_TREE;
4669   gimple new_stmt = NULL;
4670   int j;
4671   tree ops[3];
4672   bool nested_cycle = false, found_nested_cycle_def = false;
4673   gimple reduc_def_stmt = NULL;
4674   /* By default the reduction variable is the last operand of the statement;
          the operand scan below overrides this when it finds a nested-cycle
          def among the other operands.  */
4675   int reduc_index = 2;
4676   bool double_reduc = false, dummy;
4677   basic_block def_bb;
4678   struct loop * def_stmt_loop, *outer_loop = NULL;
4679   tree def_arg;
4680   gimple def_arg_stmt;
4681   vec<tree> vec_oprnds0 = vNULL;
4682   vec<tree> vec_oprnds1 = vNULL;
4683   vec<tree> vect_defs = vNULL;
4684   vec<gimple> phis = vNULL;
4685   int vec_num;
4686   tree def0, def1, tem, op0, op1 = NULL_TREE;
4687
4688   /* In case of reduction chain we switch to the first stmt in the chain, but
4689      we don't update STMT_INFO, since only the last stmt is marked as reduction
4690      and has reduction properties.  */
4691   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4692     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4693
       /* When STMT is nested in the vectorized loop, switch LOOP to the inner
          loop and keep the vectorized (outer) loop in OUTER_LOOP.  */
4694   if (nested_in_vect_loop_p (loop, stmt))
4695     {
4696       outer_loop = loop;
4697       loop = loop->inner;
4698       nested_cycle = true;
4699     }
4700
4701   /* 1. Is vectorizable reduction?  */
4702   /* Not supportable if the reduction variable is used in the loop, unless
4703      it's a reduction chain.  */
4704   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4705       && !GROUP_FIRST_ELEMENT (stmt_info))
4706     return false;
4707
4708   /* Reductions that are not used even in an enclosing outer-loop,
4709      are expected to be "live" (used out of the loop).  */
4710   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4711       && !STMT_VINFO_LIVE_P (stmt_info))
4712     return false;
4713
4714   /* Make sure it was already recognized as a reduction computation.  */
4715   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4716       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4717     return false;
4718
4719   /* 2. Has this been recognized as a reduction pattern?
4720
4721      Check if STMT represents a pattern that has been recognized
4722      in earlier analysis stages.  For stmts that represent a pattern,
4723      the STMT_VINFO_RELATED_STMT field records the last stmt in
4724      the original sequence that constitutes the pattern.  */
4725
4726   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4727   if (orig_stmt)
4728     {
           /* The related (original) stmt must be flagged as being in a pattern,
              while the pattern stmt itself must not be.  */
4729       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4730       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4731       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4732     }
4733
4734   /* 3. Check the operands of the operation.  The first operands are defined
4735         inside the loop body. The last operand is the reduction variable,
4736         which is defined by the loop-header-phi.  */
4737
4738   gcc_assert (is_gimple_assign (stmt));
4739
4740   /* Flatten RHS: collect the operands in OPS[], the operation in CODE and
          the number of operands in OP_TYPE.  */
4741   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4742     {
4743     case GIMPLE_SINGLE_RHS:
           /* A single-rhs tree is only handled when it has three operands.  */
4744       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4745       if (op_type == ternary_op)
4746         {
4747           tree rhs = gimple_assign_rhs1 (stmt);
4748           ops[0] = TREE_OPERAND (rhs, 0);
4749           ops[1] = TREE_OPERAND (rhs, 1);
4750           ops[2] = TREE_OPERAND (rhs, 2);
4751           code = TREE_CODE (rhs);
4752         }
4753       else
4754         return false;
4755       break;
4756
4757     case GIMPLE_BINARY_RHS:
4758       code = gimple_assign_rhs_code (stmt);
4759       op_type = TREE_CODE_LENGTH (code);
4760       gcc_assert (op_type == binary_op);
4761       ops[0] = gimple_assign_rhs1 (stmt);
4762       ops[1] = gimple_assign_rhs2 (stmt);
4763       break;
4764
4765     case GIMPLE_TERNARY_RHS:
4766       code = gimple_assign_rhs_code (stmt);
4767       op_type = TREE_CODE_LENGTH (code);
4768       gcc_assert (op_type == ternary_op);
4769       ops[0] = gimple_assign_rhs1 (stmt);
4770       ops[1] = gimple_assign_rhs2 (stmt);
4771       ops[2] = gimple_assign_rhs3 (stmt);
4772       break;
4773
4774     case GIMPLE_UNARY_RHS:
           /* Unary operations are not handled as reductions.  */
4775       return false;
4776
4777     default:
4778       gcc_unreachable ();
4779     }
4780
       /* COND_EXPR reductions are not handled for SLP.  */
4781   if (code == COND_EXPR && slp_node)
4782     return false;
4783
       /* Only pointer, integral and scalar floating-point results are
          handled.  */
4784   scalar_dest = gimple_assign_lhs (stmt);
4785   scalar_type = TREE_TYPE (scalar_dest);
4786   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4787       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4788     return false;
4789
4790   /* Do not try to vectorize bit-precision reductions.  */
4791   if ((TYPE_PRECISION (scalar_type)
4792        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4793     return false;
4794
4795   /* All uses but the last are expected to be defined in the loop.
4796      The last use is the reduction variable.  In case of nested cycle this
4797      assumption is not true: we use reduc_index to record the index of the
4798      reduction variable.  */
4799   for (i = 0; i < op_type - 1; i++)
4800     {
4801       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4802       if (i == 0 && code == COND_EXPR)
4803         continue;
4804
4805       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4806                                             &def_stmt, &def, &dt, &tem);
           /* VECTYPE_IN is taken from the first operand that provides one.  */
4807       if (!vectype_in)
4808         vectype_in = tem;
4809       gcc_assert (is_simple_use);
4810
4811       if (dt != vect_internal_def
4812           && dt != vect_external_def
4813           && dt != vect_constant_def
4814           && dt != vect_induction_def
4815           && !(dt == vect_nested_cycle && nested_cycle))
4816         return false;
4817
4818       if (dt == vect_nested_cycle)
4819         {
4820           found_nested_cycle_def = true;
4821           reduc_def_stmt = def_stmt;
4822           reduc_index = i;
4823         }
4824     }
4825
       /* Here I == OP_TYPE - 1, i.e. OPS[I] is the last operand.  */
4826   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4827                                         &def_stmt, &def, &dt, &tem);
4828   if (!vectype_in)
4829     vectype_in = tem;
4830   gcc_assert (is_simple_use);
4831   if (!(dt == vect_reduction_def
4832         || dt == vect_nested_cycle
4833         || ((dt == vect_internal_def || dt == vect_external_def
4834              || dt == vect_constant_def || dt == vect_induction_def)
4835             && nested_cycle && found_nested_cycle_def)))
4836     {
4837       /* For pattern recognized stmts, orig_stmt might be a reduction,
4838          but some helper statements for the pattern might not, or
4839          might be COND_EXPRs with reduction uses in the condition.  */
4840       gcc_assert (orig_stmt);
4841       return false;
4842     }
       /* Unless an earlier operand was the nested-cycle def, the last operand's
          definition is the reduction PHI.  */
4843   if (!found_nested_cycle_def)
4844     reduc_def_stmt = def_stmt;
4845
4846   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4847   if (orig_stmt)
4848     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4849                                                        reduc_def_stmt,
4850                                                        !nested_cycle,
4851                                                        &dummy));
4852   else
4853     {
4854       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4855                                              !nested_cycle, &dummy);
4856       /* We changed STMT to be the first stmt in reduction chain, hence we
4857          check that in this case the first element in the chain is STMT.  */
4858       gcc_assert (stmt == tmp
4859                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4860     }
4861
       /* A reduction PHI that is itself marked live is not handled.  */
4862   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4863     return false;
4864
       /* For SLP a single copy is generated here; otherwise one copy is needed
          for every VECTYPE_IN-sized part of the vectorization factor.  */
4865   if (slp_node || PURE_SLP_STMT (stmt_info))
4866     ncopies = 1;
4867   else
4868     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4869                / TYPE_VECTOR_SUBPARTS (vectype_in));
4870
4871   gcc_assert (ncopies >= 1);
4872
       /* Mode used for the in-loop support checks.  For pattern stmts it is
          replaced by the mode of VECTYPE_OUT before the epilog check.  */
4873   vec_mode = TYPE_MODE (vectype_in);
4874
4875   if (code == COND_EXPR)
4876     {
4877       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4878         {
4879           if (dump_enabled_p ())
4880             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4881                              "unsupported condition in reduction\n");
4882
4883             return false;
4884         }
4885     }
4886   else
4887     {
4888       /* 4. Supportable by target?  */
4889
4890       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4891           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4892         {
4893           /* Shifts and rotates are only supported by vectorizable_shifts,
4894              not vectorizable_reduction.  */
4895           if (dump_enabled_p ())
4896             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4897                              "unsupported shift or rotation.\n");
4898           return false;
4899         }
4900
4901       /* 4.1. check support for the operation in the loop  */
4902       optab = optab_for_tree_code (code, vectype_in, optab_default);
4903       if (!optab)
4904         {
4905           if (dump_enabled_p ())
4906             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4907                              "no optab.\n");
4908
4909           return false;
4910         }
4911
4912       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4913         {
4914           if (dump_enabled_p ())
4915             dump_printf (MSG_NOTE, "op not supported by target.\n");
4916
               /* Without a target handler, continue only if the vector mode is
                  exactly one word wide and the vectorization factor reaches the
                  minimum returned by vect_min_worthwhile_factor.  */
4917           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4918               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4919                   < vect_min_worthwhile_factor (code))
4920             return false;
4921
4922           if (dump_enabled_p ())
4923             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4924         }
4925
4926       /* Worthwhile without SIMD support?  */
4927       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4928           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4929              < vect_min_worthwhile_factor (code))
4930         {
4931           if (dump_enabled_p ())
4932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4933                              "not worthwhile without SIMD support.\n");
4934
4935           return false;
4936         }
4937     }
4938
4939   /* 4.2. Check support for the epilog operation.
4940
4941           If STMT represents a reduction pattern, then the type of the
4942           reduction variable may be different than the type of the rest
4943           of the arguments.  For example, consider the case of accumulation
4944           of shorts into an int accumulator; The original code:
4945                         S1: int_a = (int) short_a;
4946           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4947
4948           was replaced with:
4949                         STMT: int_acc = widen_sum <short_a, int_acc>
4950
4951           This means that:
4952           1. The tree-code that is used to create the vector operation in the
4953              epilog code (that reduces the partial results) is not the
4954              tree-code of STMT, but is rather the tree-code of the original
4955              stmt from the pattern that STMT is replacing.  I.e, in the example
4956              above we want to use 'widen_sum' in the loop, but 'plus' in the
4957              epilog.
4958           2. The type (mode) we use to check available target support
4959              for the vector operation to be created in the *epilog*, is
4960              determined by the type of the reduction variable (in the example
4961              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4962              However the type (mode) we use to check available target support
4963              for the vector operation to be created *inside the loop*, is
4964              determined by the type of the other arguments to STMT (in the
4965              example we'd check this: optab_handler (widen_sum_optab,
4966              vect_short_mode)).
4967
4968           This is contrary to "regular" reductions, in which the types of all
4969           the arguments are the same as the type of the reduction variable.
4970           For "regular" reductions we can therefore use the same vector type
4971           (and also the same tree-code) when generating the epilog code and
4972           when generating the code inside the loop.  */
4973
4974   if (orig_stmt)
4975     {
4976       /* This is a reduction pattern: get the vectype from the type of the
4977          reduction variable, and get the tree-code from orig_stmt.  */
4978       orig_code = gimple_assign_rhs_code (orig_stmt);
4979       gcc_assert (vectype_out);
4980       vec_mode = TYPE_MODE (vectype_out);
4981     }
4982   else
4983     {
4984       /* Regular reduction: use the same vectype and tree-code as used for
4985          the vector code inside the loop can be used for the epilog code. */
4986       orig_code = code;
4987     }
4988
       /* Detect a double reduction: the preheader argument of the reduction PHI
          is an SSA name defined by a PHI located inside OUTER_LOOP whose def
          type is vect_double_reduction_def.  */
4989   if (nested_cycle)
4990     {
4991       def_bb = gimple_bb (reduc_def_stmt);
4992       def_stmt_loop = def_bb->loop_father;
4993       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4994                                        loop_preheader_edge (def_stmt_loop));
4995       if (TREE_CODE (def_arg) == SSA_NAME
4996           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4997           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4998           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4999           && vinfo_for_stmt (def_arg_stmt)
5000           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5001               == vect_double_reduction_def)
5002         double_reduc = true;
5003     }
5004
       /* EPILOG_REDUC_CODE is left as (or reset to) ERROR_MARK whenever no
          usable vector reduction operation is found; that value is then what
          gets passed to the cost model and to the epilog generation.  */
5005   epilog_reduc_code = ERROR_MARK;
5006   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5007     {
5008       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5009                                          optab_default);
5010       if (!reduc_optab)
5011         {
5012           if (dump_enabled_p ())
5013             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5014                              "no optab for reduction.\n");
5015
5016           epilog_reduc_code = ERROR_MARK;
5017         }
5018
5019       if (reduc_optab
5020           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5021         {
5022           if (dump_enabled_p ())
5023             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5024                              "reduc op not supported by target.\n");
5025
5026           epilog_reduc_code = ERROR_MARK;
5027         }
5028     }
5029   else
5030     {
           /* Having no reduction code for the scalar code is only tolerated for
              a nested cycle that is not a double reduction.  */
5031       if (!nested_cycle || double_reduc)
5032         {
5033           if (dump_enabled_p ())
5034             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5035                              "no reduc code for scalar code.\n");
5036
5037           return false;
5038         }
5039     }
5040
5041   if (double_reduc && ncopies > 1)
5042     {
5043       if (dump_enabled_p ())
5044         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5045                          "multiple types in double reduction\n");
5046
5047       return false;
5048     }
5049
5050   /* In case of widening multiplication by a constant, we update the type
5051      of the constant to be the type of the other operand.  We check that the
5052      constant fits the type in the pattern recognition pass.  */
5053   if (code == DOT_PROD_EXPR
5054       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5055     {
5056       if (TREE_CODE (ops[0]) == INTEGER_CST)
5057         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5058       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5059         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5060       else
5061         {
5062           if (dump_enabled_p ())
5063             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5064                              "invalid types in dot-prod\n");
5065
5066           return false;
5067         }
5068     }
5069
5070   if (!vec_stmt) /* transformation not required.  */
5071     {
           /* Analysis only: model the cost and tag the stmt as a reduction.  */
5072       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5073         return false;
5074       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5075       return true;
5076     }
5077
5078   /** Transform.  **/
5079
5080   if (dump_enabled_p ())
5081     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5082
5083   /* FORNOW: Multiple types are not supported for condition.  */
5084   if (code == COND_EXPR)
5085     gcc_assert (ncopies == 1);
5086
5087   /* Create the destination vector  */
5088   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5089
5090   /* In case the vectorization factor (VF) is bigger than the number
5091      of elements that we can fit in a vectype (nunits), we have to generate
5092      more than one vector stmt - i.e - we need to "unroll" the
5093      vector stmt by a factor VF/nunits.  For more details see documentation
5094      in vectorizable_operation.  */
5095
5096   /* If the reduction is used in an outer loop we need to generate
5097      VF intermediate results, like so (e.g. for ncopies=2):
5098         r0 = phi (init, r0)
5099         r1 = phi (init, r1)
5100         r0 = x0 + r0;
5101         r1 = x1 + r1;
5102     (i.e. we generate VF results in 2 registers).
5103     In this case we have a separate def-use cycle for each copy, and therefore
5104     for each copy we get the vector def for the reduction variable from the
5105     respective phi node created for this copy.
5106
5107     Otherwise (the reduction is unused in the loop nest), we can combine
5108     together intermediate results, like so (e.g. for ncopies=2):
5109         r = phi (init, r)
5110         r = x0 + r;
5111         r = x1 + r;
5112    (i.e. we generate VF/2 results in a single register).
5113    In this case for each copy we get the vector def for the reduction variable
5114    from the vectorized reduction operation generated in the previous iteration.
5115   */
5116
5117   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5118     {
5119       single_defuse_cycle = true;
5120       epilog_copies = 1;
5121     }
5122   else
5123     epilog_copies = ncopies;
5124
5125   prev_stmt_info = NULL;
5126   prev_phi_info = NULL;
       /* Under SLP the number of vector stmts comes from the SLP node and the
          input and output vectypes must have the same number of elements.
          Otherwise a single slot is reserved per operand vector.  */
5127   if (slp_node)
5128     {
5129       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5130       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5131                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5132     }
5133   else
5134     {
5135       vec_num = 1;
5136       vec_oprnds0.create (1);
5137       if (op_type == ternary_op)
5138         vec_oprnds1.create (1);
5139     }
5140
5141   phis.create (vec_num);
5142   vect_defs.create (vec_num);
       /* Without SLP, slot 0 of VECT_DEFS is overwritten with the result of
          each copy below.  */
5143   if (!slp_node)
5144     vect_defs.quick_push (NULL_TREE);
5145
5146   for (j = 0; j < ncopies; j++)
5147     {
           /* A new PHI is created for every copy, except that with a single
              def-use cycle only the first copy gets one.  */
5148       if (j == 0 || !single_defuse_cycle)
5149         {
5150           for (i = 0; i < vec_num; i++)
5151             {
5152               /* Create the reduction-phi that defines the reduction
5153                  operand.  */
5154               new_phi = create_phi_node (vec_dest, loop->header);
5155               set_vinfo_for_stmt (new_phi,
5156                                   new_stmt_vec_info (new_phi, loop_vinfo,
5157                                                      NULL));
                    /* Only the PHIs of the first copy (or all of them under SLP)
                       are recorded in PHIS.  */
5158                if (j == 0 || slp_node)
5159                  phis.quick_push (new_phi);
5160             }
5161         }
5162
5163       if (code == COND_EXPR)
5164         {
5165           gcc_assert (!slp_node);
5166           vectorizable_condition (stmt, gsi, vec_stmt,
5167                                   PHI_RESULT (phis[0]),
5168                                   reduc_index, NULL);
5169           /* Multiple types are not supported for condition.  */
5170           break;
5171         }
5172
5173       /* Handle uses.  */
5174       if (j == 0)
5175         {
               /* OPS[!REDUC_INDEX] is OPS[1] when the reduction operand is
                  OPS[0], and OPS[0] otherwise.  For ternary operations OP1 is
                  the remaining non-reduction operand.  */
5176           op0 = ops[!reduc_index];
5177           if (op_type == ternary_op)
5178             {
5179               if (reduc_index == 0)
5180                 op1 = ops[2];
5181               else
5182                 op1 = ops[1];
5183             }
5184
5185           if (slp_node)
5186             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5187                                slp_node, -1);
5188           else
5189             {
5190               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5191                                                             stmt, NULL);
5192               vec_oprnds0.quick_push (loop_vec_def0);
5193               if (op_type == ternary_op)
5194                {
5195                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5196                                                                NULL);
5197                  vec_oprnds1.quick_push (loop_vec_def1);
5198                }
5199             }
5200         }
5201       else
5202         {
5203           if (!slp_node)
5204             {
                   /* NOTE: DT and DUMMY declared here shadow the function-level
                      variables of the same names.  */
5205               enum vect_def_type dt;
5206               gimple dummy_stmt;
5207               tree dummy;
5208
                   /* For later copies, derive each operand's vector def from the
                      def used by the previous copy.  */
5209               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5210                                   &dummy_stmt, &dummy, &dt);
5211               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5212                                                               loop_vec_def0);
5213               vec_oprnds0[0] = loop_vec_def0;
5214               if (op_type == ternary_op)
5215                 {
5216                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5217                                       &dummy, &dt);
5218                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5219                                                                 loop_vec_def1);
5220                   vec_oprnds1[0] = loop_vec_def1;
5221                 }
5222             }
5223
               /* With a single def-use cycle the result of the previous copy is
                  the reduction operand of this copy.  */
5224           if (single_defuse_cycle)
5225             reduc_def = gimple_assign_lhs (new_stmt);
5226
5227           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5228         }
5229
5230       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5231         {
5232           if (slp_node)
5233             reduc_def = PHI_RESULT (phis[i]);
5234           else
5235             {
5236               if (!single_defuse_cycle || j == 0)
5237                 reduc_def = PHI_RESULT (new_phi);
5238             }
5239
5240           def1 = ((op_type == ternary_op)
5241                   ? vec_oprnds1[i] : NULL);
               /* Build the vector expression, placing REDUC_DEF at the operand
                  position given by REDUC_INDEX.  */
5242           if (op_type == binary_op)
5243             {
5244               if (reduc_index == 0)
5245                 expr = build2 (code, vectype_out, reduc_def, def0);
5246               else
5247                 expr = build2 (code, vectype_out, def0, reduc_def);
5248             }
5249           else
5250             {
5251               if (reduc_index == 0)
5252                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5253               else
5254                 {
5255                   if (reduc_index == 1)
5256                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5257                   else
5258                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5259                 }
5260             }
5261
5262           new_stmt = gimple_build_assign (vec_dest, expr);
5263           new_temp = make_ssa_name (vec_dest, new_stmt);
5264           gimple_assign_set_lhs (new_stmt, new_temp);
5265           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5266
5267           if (slp_node)
5268             {
5269               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5270               vect_defs.quick_push (new_temp);
5271             }
5272           else
5273             vect_defs[0] = new_temp;
5274         }
5275
5276       if (slp_node)
5277         continue;
5278
           /* Record the first copy as the vectorized stmt and chain the later
              copies through STMT_VINFO_RELATED_STMT.  */
5279       if (j == 0)
5280         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5281       else
5282         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5283
5284       prev_stmt_info = vinfo_for_stmt (new_stmt);
5285       prev_phi_info = vinfo_for_stmt (new_phi);
5286     }
5287
5288   /* Finalize the reduction-phi (set its arguments) and create the
5289      epilog reduction code.  */
5290   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5291     {
           /* Pass the lhs of *VEC_STMT to the epilog as the vector def.  */
5292       new_temp = gimple_assign_lhs (*vec_stmt);
5293       vect_defs[0] = new_temp;
5294     }
5295
5296   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5297                                     epilog_reduc_code, phis, reduc_index,
5298                                     double_reduc, slp_node);
5299
       /* Release the vectors created for the transformation.  */
5300   phis.release ();
5301   vect_defs.release ();
5302   vec_oprnds0.release ();
5303   vec_oprnds1.release ();
5304
5305   return true;
5306 }
5307
5308 /* Function vect_min_worthwhile_factor.
5309
5310    Return the smallest vectorization factor for which vectorizing the
5311    operation CODE with generic vectors is considered worthwhile, for a
5312    loop in which that operation could be vectorized.  */
5313 int
5314 vect_min_worthwhile_factor (enum tree_code code)
5315 {
5316   /* Addition, subtraction and negation need a factor of at least 4.  */
5317   if (code == PLUS_EXPR
5318       || code == MINUS_EXPR
5319       || code == NEGATE_EXPR)
5320     return 4;
5321
5322   /* The bitwise operations are worthwhile from a factor of 2.  */
5323   if (code == BIT_AND_EXPR
5324       || code == BIT_IOR_EXPR
5325       || code == BIT_XOR_EXPR
5326       || code == BIT_NOT_EXPR)
5327     return 2;
5328
5329   /* Any other tree code gets INT_MAX, which no vectorization factor
5330      compared against it is expected to reach.  */
5331   return INT_MAX;
5332 }
5333
5334
5335 /* Function vectorizable_induction
5336
5337    Decide whether the induction computed by PHI can be vectorized.  When
5338    VEC_STMT is NULL only analysis is done (the stmt type and cost are
5339    recorded); otherwise the vectorized phi is created and stored in VEC_STMT.
5340    Return FALSE if PHI is not vectorizable, TRUE otherwise.  */
5341
5342 bool
5343 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5344                         gimple *vec_stmt)
5345 {
5346   stmt_vec_info phi_info = vinfo_for_stmt (phi);
5347   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (phi_info);
5348   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5349   tree phi_vectype = STMT_VINFO_VECTYPE (phi_info);
5350   int lanes = TYPE_VECTOR_SUBPARTS (phi_vectype);
5351   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / lanes;
5352
5353   gcc_assert (ncopies >= 1);
5354
5355   /* FORNOW. These restrictions should be relaxed.  */
5356   if (nested_in_vect_loop_p (vect_loop, phi))
5357     {
5358       /* Several copies are not handled when PHI is nested.  */
5359       if (ncopies > 1)
5360         {
5361           if (dump_enabled_p ())
5362             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5363                              "multiple types in nested loop.\n");
5364           return false;
5365         }
5366
5367       /* Find a use of PHI's latch argument outside of the inner loop.  */
5368       struct loop *inner = vect_loop->inner;
5369       edge latch = loop_latch_edge (inner);
5370       tree latch_val = PHI_ARG_DEF_FROM_EDGE (phi, latch);
5371       gimple outside_use = NULL;
5372       imm_use_iterator use_iter;
5373       use_operand_p use;
5374       FOR_EACH_IMM_USE_FAST (use, use_iter, latch_val)
5375         {
5376           gimple user = USE_STMT (use);
5377           if (flow_bb_inside_loop_p (inner, gimple_bb (user)))
5378             continue;
5379           outside_use = user;
5380           break;
5381         }
5382
5383       if (outside_use)
5384         {
5385           stmt_vec_info use_info = vinfo_for_stmt (outside_use);
5386           /* Such a use has to be relevant and must not be live.  */
5387           if (!STMT_VINFO_RELEVANT_P (use_info)
5388               || STMT_VINFO_LIVE_P (use_info))
5389             {
5390               if (dump_enabled_p ())
5391                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5392                                  "inner-loop induction only used outside "
5393                                  "of the outer vectorized loop.\n");
5394               return false;
5395             }
5396         }
5397     }
5398
5399   /* Irrelevant stmts and (FORNOW) SLP stmts are not handled.  */
5400   if (!STMT_VINFO_RELEVANT_P (phi_info) || STMT_SLP_TYPE (phi_info))
5401     return false;
5402
5403   gcc_assert (STMT_VINFO_DEF_TYPE (phi_info) == vect_induction_def);
5404
5405   if (gimple_code (phi) != GIMPLE_PHI)
5406     return false;
5407
5408   if (vec_stmt)
5409     {
5410       /** Transform.  **/
5411       if (dump_enabled_p ())
5412         dump_printf_loc (MSG_NOTE, vect_location,
5413                          "transform induction phi.\n");
5414
5415       tree init_def = get_initial_def_for_induction (phi);
5416       *vec_stmt = SSA_NAME_DEF_STMT (init_def);
5417     }
5418   else
5419     {
5420       /* Transformation not required: record the stmt kind and its cost.  */
5421       STMT_VINFO_TYPE (phi_info) = induc_vec_info_type;
5422       if (dump_enabled_p ())
5423         dump_printf_loc (MSG_NOTE, vect_location,
5424                          "=== vectorizable_induction ===\n");
5425       vect_model_induction_cost (phi_info, ncopies);
5426     }
5427
5428   return true;
5429 }
5430
5431 /* Function vectorizable_live_operation.
5432
5433    STMT computes a value that is used outside the loop.  Return true if it
5434    can be supported: either an IFN_GOMP_SIMD_LANE call for the loop's simduid
5435    used by an exit-block PHI (its exit-edge argument is set to VF - 1 when
5436    VEC_STMT is non-NULL), or an SSA assignment of invariant operands.  */
5437
5438 bool
5439 vectorizable_live_operation (gimple stmt,
5440                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5441                              gimple *vec_stmt)
5442 {
5443   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5444   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5445   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5446   int i, op_type;
5447   tree op, def;
5448   gimple def_stmt;
5449   enum vect_def_type dt;
5450   enum tree_code code;
5451   enum gimple_rhs_class rhs_class;
5452
5453   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5454
5455   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5456     return false;
5457
5458   if (!is_gimple_assign (stmt))
5459     {
5460       if (gimple_call_internal_p (stmt)
5461           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5462           && gimple_call_lhs (stmt)
5463           && loop->simduid
5464           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5465           && loop->simduid
5466              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5467         {
5468           edge e = single_exit (loop);
5469           basic_block merge_bb = e->dest;
5470           imm_use_iterator imm_iter;
5471           use_operand_p use_p;
5472           tree lhs = gimple_call_lhs (stmt);
5473
5474           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5475             {
5476               gimple use_stmt = USE_STMT (use_p);
5477               /* Only a PHI in the exit block has an argument for edge E.  */
5478               if (gimple_code (use_stmt) == GIMPLE_PHI
5479                   && gimple_bb (use_stmt) == merge_bb)
5480                 {
5481                   if (vec_stmt)
5482                     {
5483                       int vf = loop_vinfo->vectorization_factor;
5484                       tree vfm1 = build_int_cst (unsigned_type_node, vf - 1);
5485                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5486                     }
5487                   return true;
5488                 }
5489             }
5490         }
5491
5492       return false;
5493     }
5494
5495   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5496     return false;
5497
5498   /* FORNOW. CHECKME. */
5499   if (nested_in_vect_loop_p (loop, stmt))
5500     return false;
5501
5502   code = gimple_assign_rhs_code (stmt);
5503   op_type = TREE_CODE_LENGTH (code);
5504   rhs_class = get_gimple_rhs_class (code);
5505   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5506   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5507
5508   /* FORNOW: support only if all uses are invariant.  This means
5509      that the scalar operations can remain in place, unvectorized.
5510      The original last scalar value that they compute will be used.  */
5511   for (i = 0; i < op_type; i++)
5512     {
5513       if (rhs_class == GIMPLE_SINGLE_RHS)
5514         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5515       else
5516         op = gimple_op (stmt, i + 1);
5517       /* A missing operand imposes no constraint and would leave DT unset.  */
5518       if (!op)
5519         continue;
5520       if (!vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5521                                &dt))
5522         {
5523           if (dump_enabled_p ())
5524             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5525                              "use not simple.\n");
5526           return false;
5527         }
5528       if (dt != vect_external_def && dt != vect_constant_def)
5529         return false;
5530     }
5531
5532   /* No transformation is required for the cases we currently support.  */
5533   return true;
5534 }
5535
5536 /* Kill any debug uses outside LOOP of SSA names defined in STMT: the
5537    value of each such debug bind is reset.  Debug uses outside LOOP that
5538    are not binds are not expected and trip gcc_unreachable.  */
5539
5540 static void
5541 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5542 {
5543   ssa_op_iter def_iter;
5544   imm_use_iterator use_iter;
5545   def_operand_p defp;
5546   gimple use_stmt;
5547
5548   /* Visit every statement using one of the SSA names STMT defines.  */
5549   FOR_EACH_PHI_OR_STMT_DEF (defp, stmt, def_iter, SSA_OP_DEF)
5550     {
5551       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (defp))
5552         {
5553           if (!is_gimple_debug (use_stmt))
5554             continue;
5555
5556           /* Debug uses inside LOOP are left alone.  */
5557           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
5558             continue;
5559
5560           /* Only debug binds can be handled here.  */
5561           if (!gimple_debug_bind_p (use_stmt))
5562             gcc_unreachable ();
5563
5564           if (dump_enabled_p ())
5565             dump_printf_loc (MSG_NOTE, vect_location,
5566                              "killing debug use\n");
5567
5568           /* Reset the bind's value and update the statement.  */
5569           gimple_debug_bind_reset_value (use_stmt);
5570           update_stmt (use_stmt);
5571         }
5572     }
5573 }
5574
5575
5576 /* Gimplify an unshared copy of LOOP_VINFO's iteration count and return
5577    the resulting operand.  Statements needed for that are queued onto
5578    SEQ.  */
5579
5580 static tree
5581 vect_build_loop_niters (loop_vec_info loop_vinfo, gimple_seq *seq)
5582 {
5583   gimple_seq new_stmts = NULL;
5584   tree niters_expr = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5585   tree tmp = create_tmp_var (TREE_TYPE (niters_expr), "niters");
5586   tree result = force_gimple_operand (niters_expr, &new_stmts, false, tmp);
5587
5588   /* Hand over whatever statements gimplification produced.  */
5589   if (new_stmts != NULL)
5590     gimple_seq_add_seq (seq, new_stmts);
5591
5592   return result;
5593 }
5594
5595
5596 /* Helper for vect_generate_tmps_on_preheader.  Return EXPR unchanged if
5597    it already is a gimple value.  Otherwise force it into a temporary of
5598    type TYPE named NAME, append the statements this takes to *SEQ and
5599    return the resulting operand.  */
5600
5601 static tree
5602 vect_force_tmp_operand (tree expr, tree type, const char *name,
5603                         gimple_seq *seq)
5604 {
5605   gimple_seq new_stmts = NULL;
5606   tree tmp;
5607
5608   /* Nothing to emit for an expression that is usable as it stands.  */
5609   if (is_gimple_val (expr))
5610     return expr;
5611
5612   tmp = create_tmp_var (type, name);
5613   expr = force_gimple_operand (expr, &new_stmts, true, tmp);
5614   gimple_seq_add_seq (seq, new_stmts);
5615   return expr;
5616 }
5617
5618
5619 /* Given NI_NAME, the number of iterations the loop executes, generate
5620
5621      ratio = ni_name / vf
5622      ratio_mult_vf_name = ratio * vf
5623
5624    as shifts by log2 (vf), appending the statements this takes to
5625    COND_EXPR_STMT_LIST.  If peeling for gaps is required, ni_name - 1
5626    takes the place of ni_name.  RATIO is stored in *RATIO_NAME_PTR.
5627    RATIO_MULT_VF_NAME is only built, and stored in
5628    *RATIO_MULT_VF_NAME_PTR, when that pointer is not NULL.  */
5629
5630 static void
5631 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5632                                  tree ni_name,
5633                                  tree *ratio_mult_vf_name_ptr,
5634                                  tree *ratio_name_ptr,
5635                                  gimple_seq *cond_expr_stmt_list)
5636 {
5637   /* All temporaries take the type of the loop's iteration count.  */
5638   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
5639   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5640   /* Shift count shared by both computations.  */
5641   tree shift = build_int_cst (niters_type, exact_log2 (vf));
5642   tree niters = ni_name;
5643   tree ratio;
5644
5645   /* If an epilogue loop is required because of data accesses with gaps,
5646      one iteration is subtracted from the total here so that RATIO is
5647      calculated correctly.  */
5648   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5649     {
5650       tree ni_type = TREE_TYPE (ni_name);
5651
5652       niters = fold_build2 (MINUS_EXPR, ni_type, ni_name,
5653                             build_one_cst (ni_type));
5654       niters = vect_force_tmp_operand (niters, niters_type, "ni_gap",
5655                                        cond_expr_stmt_list);
5656     }
5657
5658   /* ratio = niters >> log2 (vf).  */
5659
5660   ratio = fold_build2 (RSHIFT_EXPR, TREE_TYPE (niters), niters, shift);
5661   ratio = vect_force_tmp_operand (ratio, niters_type, "bnd",
5662                                   cond_expr_stmt_list);
5663   *ratio_name_ptr = ratio;
5664
5665   /* ratio_mult_vf = ratio << log2 (vf), built only when asked for.  */
5666
5667   if (ratio_mult_vf_name_ptr)
5668     {
5669       tree ratio_mult_vf
5670         = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio), ratio, shift);
5671
5672       *ratio_mult_vf_name_ptr
5673         = vect_force_tmp_operand (ratio_mult_vf, niters_type,
5674                                   "ratio_mult_vf", cond_expr_stmt_list);
5675     }
5676 }
5677
5678
5679 /* Function vect_transform_loop.
5680
5681    The analysis phase has determined that the loop described by LOOP_VINFO
5682    is vectorizable.  Vectorize the loop - create vectorized stmts to replace
5683    the scalar stmts in the loop, and update the loop exit condition.  */
5684
5685 void
5686 vect_transform_loop (loop_vec_info loop_vinfo)
5687 {
5688   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5689   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5690   int nbbs = loop->num_nodes;
5691   gimple_stmt_iterator si;
5692   int i;
5693   tree ratio = NULL;  /* Passed to slpeel_make_loop_iterate_ntimes.  */
5694   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5695   bool grouped_store;
5696   bool slp_scheduled = false;  /* vect_schedule_slp already called?  */
5697   unsigned int nunits;
5698   gimple stmt, pattern_stmt;
5699   gimple_seq pattern_def_seq = NULL;  /* Def seq of current pattern stmt.  */
5700   gimple_stmt_iterator pattern_def_si = gsi_none ();  /* Walks that seq.  */
5701   bool transform_pattern_stmt = false;  /* Take pattern_stmt next?  */
5702   bool check_profitability = false;  /* Profitability check pending?  */
5703   int th;  /* Profitability threshold in loop iterations.  */
5704   /* Record number of iterations before we started tampering with the profile. */
5705   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5706
5707   if (dump_enabled_p ())
5708     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5709
5710   /* If the profile is imprecise, a known iteration count lets us fix it up.  */
5711   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5712     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5713
5714   /* Use the more conservative vectorization threshold.  If the number
5715      of iterations is constant assume the cost check has been performed
5716      by our caller.  If the threshold makes all loops profitable that
5717      run at least the vectorization factor number of times checking
5718      is pointless, too.  */
5719   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5720          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5721   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5722   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5723       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5724     {
5725       if (dump_enabled_p ())
5726         dump_printf_loc (MSG_NOTE, vect_location,
5727                          "Profitability threshold is %d loop iterations.\n",
5728                          th);
5729       check_profitability = true;
5730     }
5731
5732   /* Version the loop first, if required, so the profitability check
5733      comes first.  */
5734
5735   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5736       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5737     {
5738       vect_loop_versioning (loop_vinfo, th, check_profitability);
5739       check_profitability = false;  /* Handed to the versioning code.  */
5740     }
5741
5742   /* Peel the loop if there are data refs with unknown alignment.
5743      Only one data ref with unknown store is allowed.
5744      This clobbers LOOP_VINFO_NITERS but retains the original
5745      in LOOP_VINFO_NITERS_UNCHANGED.  So we cannot avoid re-computing
5746      niters.  */
5747
5748   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5749     {
5750       gimple_seq stmts = NULL;
5751       tree ni_name = vect_build_loop_niters (loop_vinfo, &stmts);
5752       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
5753       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5754                                      th, check_profitability);
5755       check_profitability = false;  /* Handed to the peeling code.  */
5756     }
5757
5758   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5759      compile time constant), or it is a constant that doesn't divide by the
5760      vectorization factor, then an epilog loop needs to be created.
5761      We therefore duplicate the loop: the original loop will be vectorized,
5762      and will compute the first (n/VF) iterations.  The second copy of the loop
5763      will remain scalar and will compute the remaining (n%VF) iterations.
5764      (VF is the vectorization factor).  */
5765
5766   if ((int) tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
5767       < exact_log2 (vectorization_factor)
5768       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5769     {
5770       tree ni_name, ratio_mult_vf;
5771       gimple_seq stmts = NULL;
5772       ni_name = vect_build_loop_niters (loop_vinfo, &stmts);
5773       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5774                                        &ratio, &stmts);
5775       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
5776       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5777                                       th, check_profitability);
5778     }
5779   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5780     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5781                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5782   else  /* Symbolic count for which no epilog loop is needed.  */
5783     {
5784       tree ni_name;
5785       gimple_seq stmts = NULL;
5786       ni_name = vect_build_loop_niters (loop_vinfo, &stmts);
5787       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL,
5788                                        &ratio, &stmts);
5789       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
5790     }
5791
5792   /* 1) Make sure the loop header has exactly two entries
5793      2) Make sure we have a preheader basic block.  */
5794
5795   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5796
5797   split_edge (loop_preheader_edge (loop));
5798
5799   /* FORNOW: the vectorizer supports only loops whose body consists
5800      of one basic block (header + empty latch). When the vectorizer will
5801      support more involved loop forms, the order by which the BBs are
5802      traversed needs to be reconsidered.  */
5803
5804   for (i = 0; i < nbbs; i++)
5805     {
5806       basic_block bb = bbs[i];
5807       stmt_vec_info stmt_info;
5808       gimple phi;
5809       /* First the PHIs; only relevant or live inductions get transformed.  */
5810       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5811         {
5812           phi = gsi_stmt (si);
5813           if (dump_enabled_p ())
5814             {
5815               dump_printf_loc (MSG_NOTE, vect_location,
5816                                "------>vectorizing phi: ");
5817               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5818               dump_printf (MSG_NOTE, "\n");
5819             }
5820           stmt_info = vinfo_for_stmt (phi);
5821           if (!stmt_info)
5822             continue;
5823
5824           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5825             vect_loop_kill_debug_uses (loop, phi);
5826
5827           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5828               && !STMT_VINFO_LIVE_P (stmt_info))
5829             continue;
5830
5831           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5832                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5833               && dump_enabled_p ())
5834             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5835
5836           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5837             {
5838               if (dump_enabled_p ())
5839                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5840               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5841             }
5842         }
5843       /* Then the statements, including pattern stmts and their def seqs.  */
5844       pattern_stmt = NULL;
5845       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5846         {
5847           bool is_store;
5848
5849           if (transform_pattern_stmt)
5850             stmt = pattern_stmt;
5851           else
5852             {
5853               stmt = gsi_stmt (si);
5854               /* During vectorization remove existing clobber stmts.  */
5855               if (gimple_clobber_p (stmt))
5856                 {
5857                   unlink_stmt_vdef (stmt);
5858                   gsi_remove (&si, true);
5859                   release_defs (stmt);
5860                   continue;
5861                 }
5862             }
5863
5864           if (dump_enabled_p ())
5865             {
5866               dump_printf_loc (MSG_NOTE, vect_location,
5867                                "------>vectorizing statement: ");
5868               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5869               dump_printf (MSG_NOTE, "\n");
5870             }
5871
5872           stmt_info = vinfo_for_stmt (stmt);
5873
5874           /* vector stmts created in the outer-loop during vectorization of
5875              stmts in an inner-loop may not have a stmt_info, and do not
5876              need to be vectorized.  */
5877           if (!stmt_info)
5878             {
5879               gsi_next (&si);
5880               continue;
5881             }
5882
5883           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5884             vect_loop_kill_debug_uses (loop, stmt);
5885           /* Use the pattern stmt when only it is relevant or live.  */
5886           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5887               && !STMT_VINFO_LIVE_P (stmt_info))
5888             {
5889               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5890                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5891                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5892                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5893                 {
5894                   stmt = pattern_stmt;
5895                   stmt_info = vinfo_for_stmt (stmt);
5896                 }
5897               else
5898                 {
5899                   gsi_next (&si);
5900                   continue;
5901                 }
5902             }
5903           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5904                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5905                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5906                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5907             transform_pattern_stmt = true;  /* Pattern stmt comes next.  */
5908
5909           /* If pattern statement has def stmts, vectorize them too.  */
5910           if (is_pattern_stmt_p (stmt_info))
5911             {
5912               if (pattern_def_seq == NULL)
5913                 {
5914                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5915                   pattern_def_si = gsi_start (pattern_def_seq);
5916                 }
5917               else if (!gsi_end_p (pattern_def_si))
5918                 gsi_next (&pattern_def_si);
5919               if (pattern_def_seq != NULL)
5920                 {
5921                   gimple pattern_def_stmt = NULL;
5922                   stmt_vec_info pattern_def_stmt_info = NULL;
5923
5924                   while (!gsi_end_p (pattern_def_si))
5925                     {
5926                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5927                       pattern_def_stmt_info
5928                         = vinfo_for_stmt (pattern_def_stmt);
5929                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5930                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5931                         break;
5932                       gsi_next (&pattern_def_si);
5933                     }
5934
5935                   if (!gsi_end_p (pattern_def_si))
5936                     {
5937                       if (dump_enabled_p ())
5938                         {
5939                           dump_printf_loc (MSG_NOTE, vect_location,
5940                                            "==> vectorizing pattern def "
5941                                            "stmt: ");
5942                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5943                                             pattern_def_stmt, 0);
5944                           dump_printf (MSG_NOTE, "\n");
5945                         }
5946
5947                       stmt = pattern_def_stmt;
5948                       stmt_info = pattern_def_stmt_info;
5949                     }
5950                   else
5951                     {
5952                       pattern_def_si = gsi_none ();
5953                       transform_pattern_stmt = false;
5954                     }
5955                 }
5956               else
5957                 transform_pattern_stmt = false;
5958             }
5959
5960           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5961           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5962                                                STMT_VINFO_VECTYPE (stmt_info));
5963           if (!STMT_SLP_TYPE (stmt_info)
5964               && nunits != (unsigned int) vectorization_factor
5965               && dump_enabled_p ())
5966             /* For SLP VF is set according to unrolling factor, and not to
5967                vector size, hence for SLP this print is not valid.  */
5968             dump_printf_loc (MSG_NOTE, vect_location,
5969                              "multiple-types.\n");
5970
5971           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5972              reached.  */
5973           if (STMT_SLP_TYPE (stmt_info))
5974             {
5975               if (!slp_scheduled)
5976                 {
5977                   slp_scheduled = true;
5978
5979                   if (dump_enabled_p ())
5980                     dump_printf_loc (MSG_NOTE, vect_location,
5981                                      "=== scheduling SLP instances ===\n");
5982
5983                   vect_schedule_slp (loop_vinfo, NULL);
5984                 }
5985
5986               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5987               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5988                 {
5989                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5990                     {
5991                       pattern_def_seq = NULL;
5992                       gsi_next (&si);
5993                     }
5994                   continue;
5995                 }
5996             }
5997
5998           /* -------- vectorize statement ------------ */
5999           if (dump_enabled_p ())
6000             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6001
6002           grouped_store = false;
6003           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6004           if (is_store)
6005             {
6006               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6007                 {
6008                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6009                      interleaving chain was completed - free all the stores in
6010                      the chain.  */
6011                   gsi_next (&si);
6012                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6013                   continue;
6014                 }
6015               else
6016                 {
6017                   /* Free the attached stmt_vec_info and remove the stmt.  */
6018                   gimple store = gsi_stmt (si);
6019                   free_stmt_vec_info (store);
6020                   unlink_stmt_vdef (store);
6021                   gsi_remove (&si, true);
6022                   release_defs (store);
6023                   continue;
6024                 }
6025             }
6026           /* Advance SI only once pattern stmt and def seq are dealt with.  */
6027           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6028             {
6029               pattern_def_seq = NULL;
6030               gsi_next (&si);
6031             }
6032         }                       /* stmts in BB */
6033     }                           /* BBs in loop */
6034
6035   slpeel_make_loop_iterate_ntimes (loop, ratio);
6036
6037   /* Reduce loop iterations by the vectorization factor.  */
6038   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6039                       expected_iterations / vectorization_factor);
6040   loop->nb_iterations_upper_bound
6041     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
6042                                             FLOOR_DIV_EXPR);
6043   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6044       && loop->nb_iterations_upper_bound != double_int_zero)
6045     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
6046   if (loop->any_estimate)  /* Same scaling for the estimate.  */
6047     {
6048       loop->nb_iterations_estimate
6049         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
6050                                              FLOOR_DIV_EXPR);
6051        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6052            && loop->nb_iterations_estimate != double_int_zero)
6053          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
6054     }
6055
6056   if (dump_enabled_p ())
6057     {
6058       dump_printf_loc (MSG_NOTE, vect_location,
6059                        "LOOP VECTORIZED\n");
6060       if (loop->inner)
6061         dump_printf_loc (MSG_NOTE, vect_location,
6062                          "OUTER LOOP VECTORIZED\n");
6063       dump_printf (MSG_NOTE, "\n");
6064     }
6065 }