gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-ssa-alias.h"
32 #include "internal-fn.h"
33 #include "gimple-expr.h"
34 #include "is-a.h"
35 #include "gimple.h"
36 #include "gimplify.h"
37 #include "gimple-iterator.h"
38 #include "gimplify-me.h"
39 #include "gimple-ssa.h"
40 #include "tree-phinodes.h"
41 #include "ssa-iterators.h"
42 #include "stringpool.h"
43 #include "tree-ssanames.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-pass.h"
48 #include "cfgloop.h"
49 #include "expr.h"
50 #include "recog.h"
51 #include "optabs.h"
52 #include "params.h"
53 #include "diagnostic-core.h"
54 #include "tree-chrec.h"
55 #include "tree-scalar-evolution.h"
56 #include "tree-vectorizer.h"
57 #include "target.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
69 }
71 as if it were manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
83 }
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 have successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors will, for now, need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
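/* Editor's addition - an illustrative, hedged sketch of the target-support
   check described above, shown in isolation.  It uses only the GCC-internal
   names already cited in this comment (optab_handler, add_optab,
   CODE_FOR_nothing, V8HImode); the helper name itself is hypothetical and is
   not part of this file.  */

static bool
example_v8hi_add_supported_p (void)
{
  /* CODE_FOR_nothing means the target has no instruction for an addition
     in V8HImode, so such a stmt could not be vectorized.  */
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}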
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159 /* Function vect_determine_vectorization_factor
161 Determine the vectorization factor (VF). VF is the number of data elements
162 that are operated upon in parallel in a single iteration of the vectorized
163 loop. For example, when vectorizing a loop that operates on 4-byte elements,
164 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
165 elements can fit in a single vector register.
167 We currently support vectorization of loops in which all types operated upon
168 are of the same size. Therefore this function currently sets VF according to
169 the size of the types operated upon, and fails if there are multiple sizes
170 in the loop.
172 VF is also the factor by which the loop iterations are strip-mined, e.g.:
173 original loop:
174 for (i=0; i<N; i++){
175 a[i] = b[i] + c[i];
178 vectorized loop:
179 for (i=0; i<N; i+=VF){
180 a[i:VF] = b[i:VF] + c[i:VF];
184 static bool
185 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
187 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
188 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
189 int nbbs = loop->num_nodes;
190 gimple_stmt_iterator si;
191 unsigned int vectorization_factor = 0;
192 tree scalar_type;
193 gimple phi;
194 tree vectype;
195 unsigned int nunits;
196 stmt_vec_info stmt_info;
197 int i;
198 HOST_WIDE_INT dummy;
199 gimple stmt, pattern_stmt = NULL;
200 gimple_seq pattern_def_seq = NULL;
201 gimple_stmt_iterator pattern_def_si = gsi_none ();
202 bool analyze_pattern_stmt = false;
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_NOTE, vect_location,
206 "=== vect_determine_vectorization_factor ===\n");
208 for (i = 0; i < nbbs; i++)
210 basic_block bb = bbs[i];
212 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
214 phi = gsi_stmt (si);
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 dump_printf (MSG_NOTE, "\n");
223 gcc_assert (stmt_info);
225 if (STMT_VINFO_RELEVANT_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 nunits = TYPE_VECTOR_SUBPARTS (vectype);
262 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
264 nunits);
266 if (!vectorization_factor
267 || (nunits > vectorization_factor))
268 vectorization_factor = nunits;
272 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
274 tree vf_vectype;
276 if (analyze_pattern_stmt)
277 stmt = pattern_stmt;
278 else
279 stmt = gsi_stmt (si);
281 stmt_info = vinfo_for_stmt (stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 dump_printf (MSG_NOTE, "\n");
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310 dump_printf (MSG_NOTE, "\n");
313 else
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
317 gsi_next (&si);
318 continue;
321 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
322 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
323 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
324 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
325 analyze_pattern_stmt = true;
327 /* If a pattern statement has def stmts, analyze them too. */
328 if (is_pattern_stmt_p (stmt_info))
330 if (pattern_def_seq == NULL)
332 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
333 pattern_def_si = gsi_start (pattern_def_seq);
335 else if (!gsi_end_p (pattern_def_si))
336 gsi_next (&pattern_def_si);
337 if (pattern_def_seq != NULL)
339 gimple pattern_def_stmt = NULL;
340 stmt_vec_info pattern_def_stmt_info = NULL;
342 while (!gsi_end_p (pattern_def_si))
344 pattern_def_stmt = gsi_stmt (pattern_def_si);
345 pattern_def_stmt_info
346 = vinfo_for_stmt (pattern_def_stmt);
347 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
348 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
349 break;
350 gsi_next (&pattern_def_si);
353 if (!gsi_end_p (pattern_def_si))
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location,
358 "==> examining pattern def stmt: ");
359 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
360 pattern_def_stmt, 0);
361 dump_printf (MSG_NOTE, "\n");
364 stmt = pattern_def_stmt;
365 stmt_info = pattern_def_stmt_info;
367 else
369 pattern_def_si = gsi_none ();
370 analyze_pattern_stmt = false;
373 else
374 analyze_pattern_stmt = false;
377 if (gimple_get_lhs (stmt) == NULL_TREE)
379 if (is_gimple_call (stmt))
381 /* Ignore calls with no lhs. These must be calls to
382 #pragma omp simd functions, and what vectorization factor
383 it really needs can't be determined until
384 vectorizable_simd_clone_call. */
385 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
387 pattern_def_seq = NULL;
388 gsi_next (&si);
390 continue;
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "not vectorized: irregular stmt.");
396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
398 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
400 return false;
403 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized: vector stmt in loop:");
409 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return false;
415 if (STMT_VINFO_VECTYPE (stmt_info))
417 /* The only case when a vectype has already been set is for stmts
418 that contain a dataref, or for "pattern-stmts" (stmts
419 generated by the vectorizer to represent/replace a certain
420 idiom). */
421 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
422 || is_pattern_stmt_p (stmt_info)
423 || !gsi_end_p (pattern_def_si));
424 vectype = STMT_VINFO_VECTYPE (stmt_info);
426 else
428 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
429 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
430 if (dump_enabled_p ())
432 dump_printf_loc (MSG_NOTE, vect_location,
433 "get vectype for scalar type: ");
434 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
435 dump_printf (MSG_NOTE, "\n");
437 vectype = get_vectype_for_scalar_type (scalar_type);
438 if (!vectype)
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized: unsupported "
444 "data-type ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
446 scalar_type);
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
449 return false;
452 STMT_VINFO_VECTYPE (stmt_info) = vectype;
454 if (dump_enabled_p ())
456 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
457 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
458 dump_printf (MSG_NOTE, "\n");
462 /* The vectorization factor is determined by the smallest
463 scalar type (or the largest vector size, but we only
464 support one vector size per loop). */
465 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
466 &dummy);
467 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "get vectype for scalar type: ");
471 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
472 dump_printf (MSG_NOTE, "\n");
474 vf_vectype = get_vectype_for_scalar_type (scalar_type);
475 if (!vf_vectype)
477 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "not vectorized: unsupported data-type ");
481 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
482 scalar_type);
483 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
485 return false;
488 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
489 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
491 if (dump_enabled_p ())
493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494 "not vectorized: different sized vector "
495 "types in statement, ");
496 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
497 vectype);
498 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
499 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
500 vf_vectype);
501 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
503 return false;
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
509 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
510 dump_printf (MSG_NOTE, "\n");
513 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
514 if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
516 if (!vectorization_factor
517 || (nunits > vectorization_factor))
518 vectorization_factor = nunits;
520 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
522 pattern_def_seq = NULL;
523 gsi_next (&si);
528 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
529 if (dump_enabled_p ())
530 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
531 vectorization_factor);
532 if (vectorization_factor <= 1)
534 if (dump_enabled_p ())
535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
536 "not vectorized: unsupported data-type\n");
537 return false;
539 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
541 return true;
545 /* Function vect_is_simple_iv_evolution.
547 FORNOW: A simple evolution of an induction variable in the loop is
548 considered a polynomial evolution. */
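/* Editor's note - an illustrative example of the distinction made above, in
   the chrec notation used by the scalar evolution analyzer (the variable
   names are hypothetical):

     for (i = 0; i < n; i++)
       p = p + 4;

   scev describes p as the chrec {p_0, +, 4}_LOOP_NB: its evolution part is
   the constant 4 and its initial condition is p_0, so the evolution is
   "simple".  For an accumulation like s += i in the same loop, the evolution
   part of s is itself a chrec ({0, +, 1}_LOOP_NB), so the tree_is_chrec
   check in the function below rejects it.  */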
550 static bool
551 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
552 tree * step)
554 tree init_expr;
555 tree step_expr;
556 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
557 basic_block bb;
559 /* When there is no evolution in this loop, the evolution function
560 is not "simple". */
561 if (evolution_part == NULL_TREE)
562 return false;
564 /* When the evolution is a polynomial of degree >= 2
565 the evolution function is not "simple". */
566 if (tree_is_chrec (evolution_part))
567 return false;
569 step_expr = evolution_part;
570 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
572 if (dump_enabled_p ())
574 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
575 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
576 dump_printf (MSG_NOTE, ", init: ");
577 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
578 dump_printf (MSG_NOTE, "\n");
581 *init = init_expr;
582 *step = step_expr;
584 if (TREE_CODE (step_expr) != INTEGER_CST
585 && (TREE_CODE (step_expr) != SSA_NAME
586 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
587 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
588 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
589 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
590 || !flag_associative_math)))
591 && (TREE_CODE (step_expr) != REAL_CST
592 || !flag_associative_math))
594 if (dump_enabled_p ())
595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
596 "step unknown.\n");
597 return false;
600 return true;
603 /* Function vect_analyze_scalar_cycles_1.
605 Examine the cross iteration def-use cycles of scalar variables
606 in LOOP. LOOP_VINFO represents the loop that is now being
607 considered for vectorization (can be LOOP, or an outer-loop
608 enclosing LOOP). */
610 static void
611 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
613 basic_block bb = loop->header;
614 tree init, step;
615 stack_vec<gimple, 64> worklist;
616 gimple_stmt_iterator gsi;
617 bool double_reduc;
619 if (dump_enabled_p ())
620 dump_printf_loc (MSG_NOTE, vect_location,
621 "=== vect_analyze_scalar_cycles ===\n");
623 /* First - identify all inductions. Reduction detection assumes that all the
624 inductions have been identified; therefore, this order must not be
625 changed. */
626 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
628 gimple phi = gsi_stmt (gsi);
629 tree access_fn = NULL;
630 tree def = PHI_RESULT (phi);
631 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
633 if (dump_enabled_p ())
635 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
636 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
637 dump_printf (MSG_NOTE, "\n");
640 /* Skip virtual phi's. The data dependences that are associated with
641 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
642 if (virtual_operand_p (def))
643 continue;
645 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
647 /* Analyze the evolution function. */
648 access_fn = analyze_scalar_evolution (loop, def);
649 if (access_fn)
651 STRIP_NOPS (access_fn);
652 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "Access function of PHI: ");
656 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
657 dump_printf (MSG_NOTE, "\n");
659 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
660 = evolution_part_in_loop_num (access_fn, loop->num);
663 if (!access_fn
664 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
665 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
666 && TREE_CODE (step) != INTEGER_CST))
668 worklist.safe_push (phi);
669 continue;
672 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
676 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
680 /* Second - identify all reductions and nested cycles. */
681 while (worklist.length () > 0)
683 gimple phi = worklist.pop ();
684 tree def = PHI_RESULT (phi);
685 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
686 gimple reduc_stmt;
687 bool nested_cycle;
689 if (dump_enabled_p ())
691 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
692 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
693 dump_printf (MSG_NOTE, "\n");
696 gcc_assert (!virtual_operand_p (def)
697 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
699 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
700 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
701 &double_reduc);
702 if (reduc_stmt)
704 if (double_reduc)
706 if (dump_enabled_p ())
707 dump_printf_loc (MSG_NOTE, vect_location,
708 "Detected double reduction.\n");
710 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
711 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
712 vect_double_reduction_def;
714 else
716 if (nested_cycle)
718 if (dump_enabled_p ())
719 dump_printf_loc (MSG_NOTE, vect_location,
720 "Detected vectorizable nested cycle.\n");
722 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
723 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
724 vect_nested_cycle;
726 else
728 if (dump_enabled_p ())
729 dump_printf_loc (MSG_NOTE, vect_location,
730 "Detected reduction.\n");
732 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
733 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
734 vect_reduction_def;
735 /* Store the reduction cycles for possible vectorization in
736 loop-aware SLP. */
737 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
741 else
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "Unknown def-use cycle pattern.\n");
749 /* Function vect_analyze_scalar_cycles.
751 Examine the cross iteration def-use cycles of scalar variables, by
752 analyzing the loop-header PHIs of scalar variables. Classify each
753 cycle as one of the following: invariant, induction, reduction, unknown.
754 We do that for the loop represented by LOOP_VINFO, and also for its
755 inner-loop, if it exists.
756 Examples for scalar cycles:
758 Example1: reduction:
760 loop1:
761 for (i=0; i<N; i++)
762 sum += a[i];
764 Example2: induction:
766 loop2:
767 for (i=0; i<N; i++)
768 a[i] = i; */
770 static void
771 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
773 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
775 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
777 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
778 Reductions in such inner-loop therefore have different properties than
779 the reductions in the nest that gets vectorized:
780 1. When vectorized, they are executed in the same order as in the original
781 scalar loop, so we can't change the order of computation when
782 vectorizing them.
783 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
784 current checks are too strict. */
786 if (loop->inner)
787 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
791 /* Function vect_get_loop_niters.
793 Determine the number of iterations the loop executes and place it
794 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
795 in NUMBER_OF_ITERATIONSM1.
797 Return the loop exit condition. */
799 static gimple
800 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
801 tree *number_of_iterationsm1)
803 tree niters;
805 if (dump_enabled_p ())
806 dump_printf_loc (MSG_NOTE, vect_location,
807 "=== get_loop_niters ===\n");
809 niters = number_of_latch_executions (loop);
810 *number_of_iterationsm1 = niters;
812 /* We want the number of loop header executions which is the number
813 of latch executions plus one.
814 ??? For UINT_MAX latch executions this number overflows to zero
815 for loops like do { n++; } while (n != 0); */
816 if (niters && !chrec_contains_undetermined (niters))
817 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
818 build_int_cst (TREE_TYPE (niters), 1));
819 *number_of_iterations = niters;
821 return get_loop_exit_condition (loop);
825 /* Function bb_in_loop_p
827 Used as predicate for dfs order traversal of the loop bbs. */
829 static bool
830 bb_in_loop_p (const_basic_block bb, const void *data)
832 const struct loop *const loop = (const struct loop *)data;
833 if (flow_bb_inside_loop_p (loop, bb))
834 return true;
835 return false;
839 /* Function new_loop_vec_info.
841 Create and initialize a new loop_vec_info struct for LOOP, as well as
842 stmt_vec_info structs for all the stmts in LOOP. */
844 static loop_vec_info
845 new_loop_vec_info (struct loop *loop)
847 loop_vec_info res;
848 basic_block *bbs;
849 gimple_stmt_iterator si;
850 unsigned int i, nbbs;
852 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
853 LOOP_VINFO_LOOP (res) = loop;
855 bbs = get_loop_body (loop);
857 /* Create/Update stmt_info for all stmts in the loop. */
858 for (i = 0; i < loop->num_nodes; i++)
860 basic_block bb = bbs[i];
862 /* BBs in a nested inner-loop will already have been processed (because
863 we will have called vect_analyze_loop_form for any nested inner-loop).
864 Therefore, for stmts in an inner-loop we just want to update the
865 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
866 loop_info of the outer-loop we are currently considering to vectorize
867 (instead of the loop_info of the inner-loop).
868 For stmts in other BBs we need to create a stmt_info from scratch. */
869 if (bb->loop_father != loop)
871 /* Inner-loop bb. */
872 gcc_assert (loop->inner && bb->loop_father == loop->inner);
873 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
875 gimple phi = gsi_stmt (si);
876 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
877 loop_vec_info inner_loop_vinfo =
878 STMT_VINFO_LOOP_VINFO (stmt_info);
879 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
880 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
882 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
884 gimple stmt = gsi_stmt (si);
885 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
886 loop_vec_info inner_loop_vinfo =
887 STMT_VINFO_LOOP_VINFO (stmt_info);
888 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
889 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
892 else
894 /* bb in current nest. */
895 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
897 gimple phi = gsi_stmt (si);
898 gimple_set_uid (phi, 0);
899 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
902 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
904 gimple stmt = gsi_stmt (si);
905 gimple_set_uid (stmt, 0);
906 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
911 /* CHECKME: We want to visit all BBs before their successors (except for
912 latch blocks, for which this assertion wouldn't hold). In the simple
913 case of the loop forms we allow, a dfs order of the BBs would be the same
914 as reversed postorder traversal, so we are safe. */
916 free (bbs);
917 bbs = XCNEWVEC (basic_block, loop->num_nodes);
918 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
919 bbs, loop->num_nodes, loop);
920 gcc_assert (nbbs == loop->num_nodes);
922 LOOP_VINFO_BBS (res) = bbs;
923 LOOP_VINFO_NITERSM1 (res) = NULL;
924 LOOP_VINFO_NITERS (res) = NULL;
925 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
926 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
927 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
928 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
929 LOOP_VINFO_VECT_FACTOR (res) = 0;
930 LOOP_VINFO_LOOP_NEST (res).create (3);
931 LOOP_VINFO_DATAREFS (res).create (10);
932 LOOP_VINFO_DDRS (res).create (10 * 10);
933 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
934 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
935 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
936 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
937 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
938 LOOP_VINFO_GROUPED_STORES (res).create (10);
939 LOOP_VINFO_REDUCTIONS (res).create (10);
940 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
941 LOOP_VINFO_SLP_INSTANCES (res).create (10);
942 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
943 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
944 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
945 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
946 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
948 return res;
952 /* Function destroy_loop_vec_info.
954 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
955 stmts in the loop. */
957 void
958 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
960 struct loop *loop;
961 basic_block *bbs;
962 int nbbs;
963 gimple_stmt_iterator si;
964 int j;
965 vec<slp_instance> slp_instances;
966 slp_instance instance;
967 bool swapped;
969 if (!loop_vinfo)
970 return;
972 loop = LOOP_VINFO_LOOP (loop_vinfo);
974 bbs = LOOP_VINFO_BBS (loop_vinfo);
975 nbbs = clean_stmts ? loop->num_nodes : 0;
976 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
978 for (j = 0; j < nbbs; j++)
980 basic_block bb = bbs[j];
981 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
982 free_stmt_vec_info (gsi_stmt (si));
984 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
986 gimple stmt = gsi_stmt (si);
988 /* We may have broken canonical form by moving a constant
989 into RHS1 of a commutative op. Fix such occurrences. */
990 if (swapped && is_gimple_assign (stmt))
992 enum tree_code code = gimple_assign_rhs_code (stmt);
994 if ((code == PLUS_EXPR
995 || code == POINTER_PLUS_EXPR
996 || code == MULT_EXPR)
997 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
998 swap_ssa_operands (stmt,
999 gimple_assign_rhs1_ptr (stmt),
1000 gimple_assign_rhs2_ptr (stmt));
1003 /* Free stmt_vec_info. */
1004 free_stmt_vec_info (stmt);
1005 gsi_next (&si);
1009 free (LOOP_VINFO_BBS (loop_vinfo));
1010 vect_destroy_datarefs (loop_vinfo, NULL);
1011 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1012 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1013 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1014 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1015 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1016 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1017 vect_free_slp_instance (instance);
1019 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1020 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1021 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1022 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1024 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1025 LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
1027 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1029 free (loop_vinfo);
1030 loop->aux = NULL;
1034 /* Function vect_analyze_loop_1.
1036 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1037 for it. The different analyses will record information in the
1038 loop_vec_info struct. This is a subset of the analyses applied in
1039 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1040 that is now considered for (outer-loop) vectorization. */
1042 static loop_vec_info
1043 vect_analyze_loop_1 (struct loop *loop)
1045 loop_vec_info loop_vinfo;
1047 if (dump_enabled_p ())
1048 dump_printf_loc (MSG_NOTE, vect_location,
1049 "===== analyze_loop_nest_1 =====\n");
1051 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1053 loop_vinfo = vect_analyze_loop_form (loop);
1054 if (!loop_vinfo)
1056 if (dump_enabled_p ())
1057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058 "bad inner-loop form.\n");
1059 return NULL;
1062 return loop_vinfo;
1066 /* Function vect_analyze_loop_form.
1068 Verify that certain CFG restrictions hold, including:
1069 - the loop has a pre-header
1070 - the loop has a single entry and exit
1071 - the loop exit condition is simple enough, and the number of iterations
1072 can be analyzed (a countable loop). */
1074 loop_vec_info
1075 vect_analyze_loop_form (struct loop *loop)
1077 loop_vec_info loop_vinfo;
1078 gimple loop_cond;
1079 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1080 loop_vec_info inner_loop_vinfo = NULL;
1082 if (dump_enabled_p ())
1083 dump_printf_loc (MSG_NOTE, vect_location,
1084 "=== vect_analyze_loop_form ===\n");
1086 /* Different restrictions apply when we are considering an inner-most loop,
1087 vs. an outer (nested) loop.
1088 (FORNOW. May want to relax some of these restrictions in the future). */
1090 if (!loop->inner)
1092 /* Inner-most loop. We currently require that the number of BBs is
1093 exactly 2 (the header and latch). Vectorizable inner-most loops
1094 look like this:
1096 (pre-header)
1098 header <--------+
1099 | | |
1100 | +--> latch --+
1102 (exit-bb) */
1104 if (loop->num_nodes != 2)
1106 if (dump_enabled_p ())
1107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1108 "not vectorized: control flow in loop.\n");
1109 return NULL;
1112 if (empty_block_p (loop->header))
1114 if (dump_enabled_p ())
1115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1116 "not vectorized: empty loop.\n");
1117 return NULL;
1120 else
1122 struct loop *innerloop = loop->inner;
1123 edge entryedge;
1125 /* Nested loop. We currently require that the loop is doubly-nested,
1126 contains a single inner loop, and the number of BBs is exactly 5.
1127 Vectorizable outer-loops look like this:
1129 (pre-header)
1131 header <---+
1133 inner-loop |
1135 tail ------+
1137 (exit-bb)
1139 The inner-loop has the properties expected of inner-most loops
1140 as described above. */
1142 if ((loop->inner)->inner || (loop->inner)->next)
1144 if (dump_enabled_p ())
1145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 "not vectorized: multiple nested loops.\n");
1147 return NULL;
1150 /* Analyze the inner-loop. */
1151 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1152 if (!inner_loop_vinfo)
1154 if (dump_enabled_p ())
1155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1156 "not vectorized: Bad inner loop.\n");
1157 return NULL;
1160 if (!expr_invariant_in_loop_p (loop,
1161 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "not vectorized: inner-loop count not"
1166 " invariant.\n");
1167 destroy_loop_vec_info (inner_loop_vinfo, true);
1168 return NULL;
1171 if (loop->num_nodes != 5)
1173 if (dump_enabled_p ())
1174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1175 "not vectorized: control flow in loop.\n");
1176 destroy_loop_vec_info (inner_loop_vinfo, true);
1177 return NULL;
1180 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1181 entryedge = EDGE_PRED (innerloop->header, 0);
1182 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1183 entryedge = EDGE_PRED (innerloop->header, 1);
1185 if (entryedge->src != loop->header
1186 || !single_exit (innerloop)
1187 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1189 if (dump_enabled_p ())
1190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 "not vectorized: unsupported outerloop form.\n");
1192 destroy_loop_vec_info (inner_loop_vinfo, true);
1193 return NULL;
1196 if (dump_enabled_p ())
1197 dump_printf_loc (MSG_NOTE, vect_location,
1198 "Considering outer-loop vectorization.\n");
1201 if (!single_exit (loop)
1202 || EDGE_COUNT (loop->header->preds) != 2)
1204 if (dump_enabled_p ())
1206 if (!single_exit (loop))
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: multiple exits.\n");
1209 else if (EDGE_COUNT (loop->header->preds) != 2)
1210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1211 "not vectorized: too many incoming edges.\n");
1213 if (inner_loop_vinfo)
1214 destroy_loop_vec_info (inner_loop_vinfo, true);
1215 return NULL;
1218 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1219 that the loop is represented as a do-while (with a proper if-guard
1220 before the loop if needed), where the loop header contains all the
1221 executable statements, and the latch is empty. */
1222 if (!empty_block_p (loop->latch)
1223 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1225 if (dump_enabled_p ())
1226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1227 "not vectorized: latch block not empty.\n");
1228 if (inner_loop_vinfo)
1229 destroy_loop_vec_info (inner_loop_vinfo, true);
1230 return NULL;
1233 /* Make sure there exists a single-predecessor exit bb: */
1234 if (!single_pred_p (single_exit (loop)->dest))
1236 edge e = single_exit (loop);
1237 if (!(e->flags & EDGE_ABNORMAL))
1239 split_loop_exit_edge (e);
1240 if (dump_enabled_p ())
1241 dump_printf (MSG_NOTE, "split exit edge.\n");
1243 else
1245 if (dump_enabled_p ())
1246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247 "not vectorized: abnormal loop exit edge.\n");
1248 if (inner_loop_vinfo)
1249 destroy_loop_vec_info (inner_loop_vinfo, true);
1250 return NULL;
1254 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1255 &number_of_iterationsm1);
1256 if (!loop_cond)
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1260 "not vectorized: complicated exit condition.\n");
1261 if (inner_loop_vinfo)
1262 destroy_loop_vec_info (inner_loop_vinfo, true);
1263 return NULL;
1266 if (!number_of_iterations
1267 || chrec_contains_undetermined (number_of_iterations))
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: number of iterations cannot be "
1272 "computed.\n");
1273 if (inner_loop_vinfo)
1274 destroy_loop_vec_info (inner_loop_vinfo, true);
1275 return NULL;
1278 if (integer_zerop (number_of_iterations))
1280 if (dump_enabled_p ())
1281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1282 "not vectorized: number of iterations = 0.\n");
1283 if (inner_loop_vinfo)
1284 destroy_loop_vec_info (inner_loop_vinfo, true);
1285 return NULL;
1288 loop_vinfo = new_loop_vec_info (loop);
1289 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1290 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1291 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1293 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1295 if (dump_enabled_p ())
1297 dump_printf_loc (MSG_NOTE, vect_location,
1298 "Symbolic number of iterations is ");
1299 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1300 dump_printf (MSG_NOTE, "\n");
1304 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1306 /* CHECKME: May want to keep it around in the future. */
1307 if (inner_loop_vinfo)
1308 destroy_loop_vec_info (inner_loop_vinfo, false);
1310 gcc_assert (!loop->aux);
1311 loop->aux = loop_vinfo;
1312 return loop_vinfo;
1316 /* Function vect_analyze_loop_operations.
1318 Scan the loop stmts and make sure they are all vectorizable. */
1320 static bool
1321 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1323 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1324 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1325 int nbbs = loop->num_nodes;
1326 gimple_stmt_iterator si;
1327 unsigned int vectorization_factor = 0;
1328 int i;
1329 gimple phi;
1330 stmt_vec_info stmt_info;
1331 bool need_to_vectorize = false;
1332 int min_profitable_iters;
1333 int min_scalar_loop_bound;
1334 unsigned int th;
1335 bool only_slp_in_loop = true, ok;
1336 HOST_WIDE_INT max_niter;
1337 HOST_WIDE_INT estimated_niter;
1338 int min_profitable_estimate;
1340 if (dump_enabled_p ())
1341 dump_printf_loc (MSG_NOTE, vect_location,
1342 "=== vect_analyze_loop_operations ===\n");
1344 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1345 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1346 if (slp)
1348 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1349 vectorization factor of the loop is the unrolling factor required by
1350 the SLP instances. If that unrolling factor is 1, we say that we
1351 perform pure SLP on the loop - cross-iteration parallelism is not
1352 exploited. */
1353 for (i = 0; i < nbbs; i++)
1355 basic_block bb = bbs[i];
1356 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1358 gimple stmt = gsi_stmt (si);
1359 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1360 gcc_assert (stmt_info);
1361 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1362 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1363 && !PURE_SLP_STMT (stmt_info))
1364 /* STMT needs both SLP and loop-based vectorization. */
1365 only_slp_in_loop = false;
1369 if (only_slp_in_loop)
1370 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1371 else
1372 vectorization_factor = least_common_multiple (vectorization_factor,
1373 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1375 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1376 if (dump_enabled_p ())
1377 dump_printf_loc (MSG_NOTE, vect_location,
1378 "Updating vectorization factor to %d\n",
1379 vectorization_factor);
1382 for (i = 0; i < nbbs; i++)
1384 basic_block bb = bbs[i];
1386 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1388 phi = gsi_stmt (si);
1389 ok = true;
1391 stmt_info = vinfo_for_stmt (phi);
1392 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1395 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1396 dump_printf (MSG_NOTE, "\n");
1399 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1400 (i.e., a phi in the tail of the outer-loop). */
1401 if (! is_loop_header_bb_p (bb))
1403 /* FORNOW: we currently don't support the case that these phis
1404 are not used in the outerloop (unless it is double reduction,
1405 i.e., this phi is vect_reduction_def), because this case
1406 requires us to actually do something here. */
1407 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1408 || STMT_VINFO_LIVE_P (stmt_info))
1409 && STMT_VINFO_DEF_TYPE (stmt_info)
1410 != vect_double_reduction_def)
1412 if (dump_enabled_p ())
1413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1414 "Unsupported loop-closed phi in "
1415 "outer-loop.\n");
1416 return false;
1419 /* If PHI is used in the outer loop, we check that its operand
1420 is defined in the inner loop. */
1421 if (STMT_VINFO_RELEVANT_P (stmt_info))
1423 tree phi_op;
1424 gimple op_def_stmt;
1426 if (gimple_phi_num_args (phi) != 1)
1427 return false;
1429 phi_op = PHI_ARG_DEF (phi, 0);
1430 if (TREE_CODE (phi_op) != SSA_NAME)
1431 return false;
1433 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1434 if (gimple_nop_p (op_def_stmt)
1435 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1436 || !vinfo_for_stmt (op_def_stmt))
1437 return false;
1439 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1440 != vect_used_in_outer
1441 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1442 != vect_used_in_outer_by_reduction)
1443 return false;
1446 continue;
1449 gcc_assert (stmt_info);
1451 if (STMT_VINFO_LIVE_P (stmt_info))
1453 /* FORNOW: not yet supported. */
1454 if (dump_enabled_p ())
1455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1456 "not vectorized: value used after loop.\n");
1457 return false;
1460 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1461 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1463 /* A scalar-dependence cycle that we don't support. */
1464 if (dump_enabled_p ())
1465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466 "not vectorized: scalar dependence cycle.\n");
1467 return false;
1470 if (STMT_VINFO_RELEVANT_P (stmt_info))
1472 need_to_vectorize = true;
1473 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1474 ok = vectorizable_induction (phi, NULL, NULL);
1477 if (!ok)
1479 if (dump_enabled_p ())
1481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1482 "not vectorized: relevant phi not "
1483 "supported: ");
1484 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1485 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1487 return false;
1491 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1493 gimple stmt = gsi_stmt (si);
1494 if (!gimple_clobber_p (stmt)
1495 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1496 return false;
1498 } /* bbs */
1500 /* All operations in the loop are either irrelevant (deal with loop
1501 control, or dead), or only used outside the loop and can be moved
1502 out of the loop (e.g. invariants, inductions). The loop can be
1503 optimized away by scalar optimizations. We're better off not
1504 touching this loop. */
1505 if (!need_to_vectorize)
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location,
1509 "All the computation can be taken out of the loop.\n");
1510 if (dump_enabled_p ())
1511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1512 "not vectorized: redundant loop. no profit to "
1513 "vectorize.\n");
1514 return false;
1517 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1518 dump_printf_loc (MSG_NOTE, vect_location,
1519 "vectorization_factor = %d, niters = "
1520 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1521 LOOP_VINFO_INT_NITERS (loop_vinfo));
1523 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1524 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1525 || ((max_niter = max_stmt_executions_int (loop)) != -1
1526 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1528 if (dump_enabled_p ())
1529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 "not vectorized: iteration count too small.\n");
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "not vectorized: iteration count smaller than "
1534 "vectorization factor.\n");
1535 return false;
1538 /* Analyze cost. Decide if it is worthwhile to vectorize. */
1540 /* Once VF is set, SLP costs should be updated since the number of created
1541 vector stmts depends on VF. */
1542 vect_update_slp_costs_according_to_vf (loop_vinfo);
1544 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1545 &min_profitable_estimate);
1546 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1548 if (min_profitable_iters < 0)
1550 if (dump_enabled_p ())
1551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1552 "not vectorized: vectorization not profitable.\n");
1553 if (dump_enabled_p ())
1554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555 "not vectorized: vector version will never be "
1556 "profitable.\n");
1557 return false;
1560 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1561 * vectorization_factor) - 1);
1564 /* Use the cost model only if it is more conservative than the user-specified
1565 threshold. */
1567 th = (unsigned) min_scalar_loop_bound;
1568 if (min_profitable_iters
1569 && (!min_scalar_loop_bound
1570 || min_profitable_iters > min_scalar_loop_bound))
1571 th = (unsigned) min_profitable_iters;
1573 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1574 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1578 "not vectorized: vectorization not profitable.\n");
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "not vectorized: iteration count smaller than user "
1582 "specified loop bound parameter or minimum profitable "
1583 "iterations (whichever is more conservative).\n");
1584 return false;
1587 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1588 && ((unsigned HOST_WIDE_INT) estimated_niter
1589 <= MAX (th, (unsigned)min_profitable_estimate)))
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1593 "not vectorized: estimated iteration count too "
1594 "small.\n");
1595 if (dump_enabled_p ())
1596 dump_printf_loc (MSG_NOTE, vect_location,
1597 "not vectorized: estimated iteration count smaller "
1598 "than specified loop bound parameter or minimum "
1599 "profitable iterations (whichever is more "
1600 "conservative).\n");
1601 return false;
1604 return true;
1608 /* Function vect_analyze_loop_2.
1610 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1611 for it. The different analyses will record information in the
1612 loop_vec_info struct. */
1613 static bool
1614 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1616 bool ok, slp = false;
1617 int max_vf = MAX_VECTORIZATION_FACTOR;
1618 int min_vf = 2;
1620 /* Find all data references in the loop (which correspond to vdefs/vuses)
1621 and analyze their evolution in the loop. Also adjust the minimal
1622 vectorization factor according to the loads and stores.
1624 FORNOW: Handle only simple array references whose
1625 alignment can be forced, and aligned pointer references. */
1627 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1628 if (!ok)
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1632 "bad data references.\n");
1633 return false;
1636 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1637 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1639 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1640 if (!ok)
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644 "bad data access.\n");
1645 return false;
1648 /* Classify all cross-iteration scalar data-flow cycles.
1649 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1651 vect_analyze_scalar_cycles (loop_vinfo);
1653 vect_pattern_recog (loop_vinfo, NULL);
1655 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1657 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1658 if (!ok)
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662 "unexpected pattern.\n");
1663 return false;
1666 /* Analyze data dependences between the data-refs in the loop
1667 and adjust the maximum vectorization factor according to
1668 the dependences.
1669 FORNOW: fail at the first data dependence that we encounter. */
1671 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1672 if (!ok
1673 || max_vf < min_vf)
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "bad data dependence.\n");
1678 return false;
1681 ok = vect_determine_vectorization_factor (loop_vinfo);
1682 if (!ok)
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "can't determine vectorization factor.\n");
1687 return false;
1689 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1693 "bad data dependence.\n");
1694 return false;
1697 /* Analyze the alignment of the data-refs in the loop.
1698 Fail if a data reference is found that cannot be vectorized. */
1700 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1701 if (!ok)
1703 if (dump_enabled_p ())
1704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1705 "bad data alignment.\n");
1706 return false;
1709 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1710 It is important to call pruning after vect_analyze_data_ref_accesses,
1711 since we use grouping information gathered by interleaving analysis. */
1712 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1713 if (!ok)
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "too long list of versioning for alias "
1718 "run-time tests.\n");
1719 return false;
1722 /* This pass will decide on using loop versioning and/or loop peeling in
1723 order to enhance the alignment of data references in the loop. */
1725 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1726 if (!ok)
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "bad data alignment.\n");
1731 return false;
1734 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1735 ok = vect_analyze_slp (loop_vinfo, NULL);
1736 if (ok)
1738 /* Decide which possible SLP instances to SLP. */
1739 slp = vect_make_slp_decision (loop_vinfo);
1741 /* Find stmts that need to be both vectorized and SLPed. */
1742 vect_detect_hybrid_slp (loop_vinfo);
1744 else
1745 return false;
1747 /* Scan all the operations in the loop and make sure they are
1748 vectorizable. */
1750 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1751 if (!ok)
1753 if (dump_enabled_p ())
1754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1755 "bad operation or unsupported loop bound.\n");
1756 return false;
1759 /* Decide whether we need to create an epilogue loop to handle
1760 remaining scalar iterations. */
1761 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1762 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1764 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1765 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1766 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1767 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1769 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1770 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1771 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
1772 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
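  /* Editor's note (illustrative): the ctz/exact_log2 comparisons above check
     whether the remaining iteration count is known to be a multiple of the
     vectorization factor.  E.g. with VF = 4 (exact_log2 == 2), a count with
     at least two trailing zero bits leaves no scalar remainder and needs no
     epilogue; otherwise PEELING_FOR_NITER is set.  */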
1774 /* If an epilogue loop is required, make sure we can create one. */
1775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1776 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1780 if (!vect_can_advance_ivs_p (loop_vinfo)
1781 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1782 single_exit (LOOP_VINFO_LOOP
1783 (loop_vinfo))))
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1787 "not vectorized: can't create required "
1788 "epilog loop\n");
1789 return false;
1793 return true;
1796 /* Function vect_analyze_loop.
1798 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1799 for it. The different analyses will record information in the
1800 loop_vec_info struct. */
1801 loop_vec_info
1802 vect_analyze_loop (struct loop *loop)
1804 loop_vec_info loop_vinfo;
1805 unsigned int vector_sizes;
1807 /* Autodetect first vector size we try. */
1808 current_vector_size = 0;
1809 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "===== analyze_loop_nest =====\n");
1815 if (loop_outer (loop)
1816 && loop_vec_info_for_loop (loop_outer (loop))
1817 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1819 if (dump_enabled_p ())
1820 dump_printf_loc (MSG_NOTE, vect_location,
1821 "outer-loop already vectorized.\n");
1822 return NULL;
1825 while (1)
1827 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1828 loop_vinfo = vect_analyze_loop_form (loop);
1829 if (!loop_vinfo)
1831 if (dump_enabled_p ())
1832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1833 "bad loop form.\n");
1834 return NULL;
1837 if (vect_analyze_loop_2 (loop_vinfo))
1839 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1841 return loop_vinfo;
1844 destroy_loop_vec_info (loop_vinfo, true);
1846 vector_sizes &= ~current_vector_size;
1847 if (vector_sizes == 0
1848 || current_vector_size == 0)
1849 return NULL;
1851 /* Try the next biggest vector size. */
1852 current_vector_size = 1 << floor_log2 (vector_sizes);
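      /* Editor's note (illustrative): autovectorize_vector_sizes returns a
         bitmask of the vector sizes (in bytes) the target supports.  If it
         returned 32 | 16 and the 32-byte attempt above just failed, the mask
         is now 16, and 1 << floor_log2 (16) selects 16 bytes for the retry.  */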
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_NOTE, vect_location,
1855 "***** Re-trying analysis with "
1856 "vector size %d\n", current_vector_size);
1861 /* Function reduction_code_for_scalar_code
1863 Input:
1864 CODE - tree_code of a reduction operation.
1866 Output:
1867 REDUC_CODE - the corresponding tree-code to be used to reduce the
1868 vector of partial results into a single scalar result (which
1869 will also reside in a vector) or ERROR_MARK if the operation is
1870 a supported reduction operation, but does not have such a tree-code.
1872 Return FALSE if CODE currently cannot be vectorized as a reduction. */
1874 static bool
1875 reduction_code_for_scalar_code (enum tree_code code,
1876 enum tree_code *reduc_code)
1878 switch (code)
1880 case MAX_EXPR:
1881 *reduc_code = REDUC_MAX_EXPR;
1882 return true;
1884 case MIN_EXPR:
1885 *reduc_code = REDUC_MIN_EXPR;
1886 return true;
1888 case PLUS_EXPR:
1889 *reduc_code = REDUC_PLUS_EXPR;
1890 return true;
1892 case MULT_EXPR:
1893 case MINUS_EXPR:
1894 case BIT_IOR_EXPR:
1895 case BIT_XOR_EXPR:
1896 case BIT_AND_EXPR:
1897 *reduc_code = ERROR_MARK;
1898 return true;
1900 default:
1901 return false;
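/* As an illustration: a scalar loop computing m = MAX (m, a[i]) reduces with
   MAX_EXPR, so the vector of partial maxima is collapsed with REDUC_MAX_EXPR.
   A bitwise-and reduction is also accepted, but gets ERROR_MARK as REDUC_CODE
   and its epilogue is generated with whole-vector shifts or element extracts
   instead (see vect_model_reduction_cost below).  */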
1906 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1907 STMT is printed with a message MSG. */
1909 static void
1910 report_vect_op (int msg_type, gimple stmt, const char *msg)
1912 dump_printf_loc (msg_type, vect_location, "%s", msg);
1913 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1914 dump_printf (msg_type, "\n");
1918 /* Detect SLP reduction of the form:
1920 #a1 = phi <a5, a0>
1921 a2 = operation (a1)
1922 a3 = operation (a2)
1923 a4 = operation (a3)
1924 a5 = operation (a4)
1926 #a = phi <a5>
1928 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1929 FIRST_STMT is the first reduction stmt in the chain
1930 (a2 = operation (a1)).
1932 Return TRUE if a reduction chain was detected. */
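/* A hypothetical source loop that produces such a chain, with four copies of
   the reduction operation per iteration, is:

     for (i = 0; i < n; i++)
       sum = sum + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into a2 = a1 + ..., a3 = a2 + ..., a4 = a3 + ...,
   a5 = a4 + ... feeding back into the reduction PHI.  */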
1934 static bool
1935 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1937 struct loop *loop = (gimple_bb (phi))->loop_father;
1938 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1939 enum tree_code code;
1940 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1941 stmt_vec_info use_stmt_info, current_stmt_info;
1942 tree lhs;
1943 imm_use_iterator imm_iter;
1944 use_operand_p use_p;
1945 int nloop_uses, size = 0, n_out_of_loop_uses;
1946 bool found = false;
1948 if (loop != vect_loop)
1949 return false;
1951 lhs = PHI_RESULT (phi);
1952 code = gimple_assign_rhs_code (first_stmt);
1953 while (1)
1955 nloop_uses = 0;
1956 n_out_of_loop_uses = 0;
1957 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1959 gimple use_stmt = USE_STMT (use_p);
1960 if (is_gimple_debug (use_stmt))
1961 continue;
1963 use_stmt = USE_STMT (use_p);
1965 /* Check if we got back to the reduction phi. */
1966 if (use_stmt == phi)
1968 loop_use_stmt = use_stmt;
1969 found = true;
1970 break;
1973 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1975 if (vinfo_for_stmt (use_stmt)
1976 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1978 loop_use_stmt = use_stmt;
1979 nloop_uses++;
1982 else
1983 n_out_of_loop_uses++;
1985 /* There can be either a single use in the loop or two uses in
1986 phi nodes. */
1987 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1988 return false;
1991 if (found)
1992 break;
1994 /* We reached a statement with no loop uses. */
1995 if (nloop_uses == 0)
1996 return false;
1998 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1999 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2000 return false;
2002 if (!is_gimple_assign (loop_use_stmt)
2003 || code != gimple_assign_rhs_code (loop_use_stmt)
2004 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2005 return false;
2007 /* Insert USE_STMT into reduction chain. */
2008 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2009 if (current_stmt)
2011 current_stmt_info = vinfo_for_stmt (current_stmt);
2012 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2013 GROUP_FIRST_ELEMENT (use_stmt_info)
2014 = GROUP_FIRST_ELEMENT (current_stmt_info);
2016 else
2017 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2019 lhs = gimple_assign_lhs (loop_use_stmt);
2020 current_stmt = loop_use_stmt;
2021 size++;
2024 if (!found || loop_use_stmt != phi || size < 2)
2025 return false;
2027 /* Swap the operands, if needed, to make the reduction operand be the second
2028 operand. */
2029 lhs = PHI_RESULT (phi);
2030 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2031 while (next_stmt)
2033 if (gimple_assign_rhs2 (next_stmt) == lhs)
2035 tree op = gimple_assign_rhs1 (next_stmt);
2036 gimple def_stmt = NULL;
2038 if (TREE_CODE (op) == SSA_NAME)
2039 def_stmt = SSA_NAME_DEF_STMT (op);
2041 /* Check that the other def is either defined in the loop
2042 ("vect_internal_def"), or it's an induction (defined by a
2043 loop-header phi-node). */
2044 if (def_stmt
2045 && gimple_bb (def_stmt)
2046 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2047 && (is_gimple_assign (def_stmt)
2048 || is_gimple_call (def_stmt)
2049 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2050 == vect_induction_def
2051 || (gimple_code (def_stmt) == GIMPLE_PHI
2052 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2053 == vect_internal_def
2054 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2056 lhs = gimple_assign_lhs (next_stmt);
2057 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2058 continue;
2061 return false;
2063 else
2065 tree op = gimple_assign_rhs2 (next_stmt);
2066 gimple def_stmt = NULL;
2068 if (TREE_CODE (op) == SSA_NAME)
2069 def_stmt = SSA_NAME_DEF_STMT (op);
2071 /* Check that the other def is either defined in the loop
2072 ("vect_internal_def"), or it's an induction (defined by a
2073 loop-header phi-node). */
2074 if (def_stmt
2075 && gimple_bb (def_stmt)
2076 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2077 && (is_gimple_assign (def_stmt)
2078 || is_gimple_call (def_stmt)
2079 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2080 == vect_induction_def
2081 || (gimple_code (def_stmt) == GIMPLE_PHI
2082 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2083 == vect_internal_def
2084 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2086 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2089 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2090 dump_printf (MSG_NOTE, "\n");
2093 swap_ssa_operands (next_stmt,
2094 gimple_assign_rhs1_ptr (next_stmt),
2095 gimple_assign_rhs2_ptr (next_stmt));
2096 update_stmt (next_stmt);
2098 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2099 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2101 else
2102 return false;
2105 lhs = gimple_assign_lhs (next_stmt);
2106 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2109 /* Save the chain for further analysis in SLP detection. */
2110 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2111 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2112 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2114 return true;
2118 /* Function vect_is_simple_reduction_1
2120 (1) Detect a cross-iteration def-use cycle that represents a simple
2121 reduction computation. We look for the following pattern:
2123 loop_header:
2124 a1 = phi < a0, a2 >
2125 a3 = ...
2126 a2 = operation (a3, a1)
2130 a3 = ...
2131 loop_header:
2132 a1 = phi < a0, a2 >
2133 a2 = operation (a3, a1)
2135 such that:
2136 1. operation is commutative and associative and it is safe to
2137 change the order of the computation (if CHECK_REDUCTION is true)
2138 2. no uses for a2 in the loop (a2 is used out of the loop)
2139 3. no uses of a1 in the loop besides the reduction operation
2140 4. no uses of a1 outside the loop.
2142 Conditions 1,4 are tested here.
2143 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2145 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2146 nested cycles, if CHECK_REDUCTION is false.
2148 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2149 reductions:
2151 a1 = phi < a0, a2 >
2152 inner loop (def of a3)
2153 a2 = phi < a3 >
2155 If MODIFY is true it tries also to rework the code in-place to enable
2156 detection of more reduction patterns. For the time being we rewrite
2157 "res -= RHS" into "res += -RHS" when it seems worthwhile.  */
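/* For example, with MODIFY set a loop accumulating

     for (i = 0; i < n; i++)
       res = res - x[i];

   is rewritten in place into res = res + (-x[i]): a NEGATE_EXPR statement is
   inserted for the right-hand side and the MINUS_EXPR becomes a PLUS_EXPR,
   which is commutative and associative (see the MINUS_EXPR handling below).  */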
2160 static gimple
2161 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2162 bool check_reduction, bool *double_reduc,
2163 bool modify)
2165 struct loop *loop = (gimple_bb (phi))->loop_father;
2166 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2167 edge latch_e = loop_latch_edge (loop);
2168 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2169 gimple def_stmt, def1 = NULL, def2 = NULL;
2170 enum tree_code orig_code, code;
2171 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2172 tree type;
2173 int nloop_uses;
2174 tree name;
2175 imm_use_iterator imm_iter;
2176 use_operand_p use_p;
2177 bool phi_def;
2179 *double_reduc = false;
2181 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2182 otherwise, we assume outer loop vectorization. */
2183 gcc_assert ((check_reduction && loop == vect_loop)
2184 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2186 name = PHI_RESULT (phi);
2187 nloop_uses = 0;
2188 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2190 gimple use_stmt = USE_STMT (use_p);
2191 if (is_gimple_debug (use_stmt))
2192 continue;
2194 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2198 "intermediate value used outside loop.\n");
2200 return NULL;
2203 if (vinfo_for_stmt (use_stmt)
2204 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2205 nloop_uses++;
2206 if (nloop_uses > 1)
2208 if (dump_enabled_p ())
2209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2210 "reduction used in loop.\n");
2211 return NULL;
2215 if (TREE_CODE (loop_arg) != SSA_NAME)
2217 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "reduction: not ssa_name: ");
2221 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2222 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2224 return NULL;
2227 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2228 if (!def_stmt)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2232 "reduction: no def_stmt.\n");
2233 return NULL;
2236 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2238 if (dump_enabled_p ())
2240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2241 dump_printf (MSG_NOTE, "\n");
2243 return NULL;
2246 if (is_gimple_assign (def_stmt))
2248 name = gimple_assign_lhs (def_stmt);
2249 phi_def = false;
2251 else
2253 name = PHI_RESULT (def_stmt);
2254 phi_def = true;
2257 nloop_uses = 0;
2258 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2260 gimple use_stmt = USE_STMT (use_p);
2261 if (is_gimple_debug (use_stmt))
2262 continue;
2263 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2264 && vinfo_for_stmt (use_stmt)
2265 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2266 nloop_uses++;
2267 if (nloop_uses > 1)
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 "reduction used in loop.\n");
2272 return NULL;
2276 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2277 defined in the inner loop. */
2278 if (phi_def)
2280 op1 = PHI_ARG_DEF (def_stmt, 0);
2282 if (gimple_phi_num_args (def_stmt) != 1
2283 || TREE_CODE (op1) != SSA_NAME)
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "unsupported phi node definition.\n");
2289 return NULL;
2292 def1 = SSA_NAME_DEF_STMT (op1);
2293 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2294 && loop->inner
2295 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2296 && is_gimple_assign (def1))
2298 if (dump_enabled_p ())
2299 report_vect_op (MSG_NOTE, def_stmt,
2300 "detected double reduction: ");
2302 *double_reduc = true;
2303 return def_stmt;
2306 return NULL;
2309 code = orig_code = gimple_assign_rhs_code (def_stmt);
2311 /* We can handle "res -= x[i]", which is non-associative by
2312 simply rewriting this into "res += -x[i]". Avoid changing
2313 gimple instruction for the first simple tests and only do this
2314 if we're allowed to change code at all. */
2315 if (code == MINUS_EXPR
2316 && modify
2317 && (op1 = gimple_assign_rhs1 (def_stmt))
2318 && TREE_CODE (op1) == SSA_NAME
2319 && SSA_NAME_DEF_STMT (op1) == phi)
2320 code = PLUS_EXPR;
2322 if (check_reduction
2323 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2325 if (dump_enabled_p ())
2326 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2327 "reduction: not commutative/associative: ");
2328 return NULL;
2331 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2333 if (code != COND_EXPR)
2335 if (dump_enabled_p ())
2336 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2337 "reduction: not binary operation: ");
2339 return NULL;
2342 op3 = gimple_assign_rhs1 (def_stmt);
2343 if (COMPARISON_CLASS_P (op3))
2345 op4 = TREE_OPERAND (op3, 1);
2346 op3 = TREE_OPERAND (op3, 0);
2349 op1 = gimple_assign_rhs2 (def_stmt);
2350 op2 = gimple_assign_rhs3 (def_stmt);
2352 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2354 if (dump_enabled_p ())
2355 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2356 "reduction: uses not ssa_names: ");
2358 return NULL;
2361 else
2363 op1 = gimple_assign_rhs1 (def_stmt);
2364 op2 = gimple_assign_rhs2 (def_stmt);
2366 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2368 if (dump_enabled_p ())
2369 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370 "reduction: uses not ssa_names: ");
2372 return NULL;
2376 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2377 if ((TREE_CODE (op1) == SSA_NAME
2378 && !types_compatible_p (type,TREE_TYPE (op1)))
2379 || (TREE_CODE (op2) == SSA_NAME
2380 && !types_compatible_p (type, TREE_TYPE (op2)))
2381 || (op3 && TREE_CODE (op3) == SSA_NAME
2382 && !types_compatible_p (type, TREE_TYPE (op3)))
2383 || (op4 && TREE_CODE (op4) == SSA_NAME
2384 && !types_compatible_p (type, TREE_TYPE (op4))))
2386 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location,
2389 "reduction: multiple types: operation type: ");
2390 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2391 dump_printf (MSG_NOTE, ", operands types: ");
2392 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2393 TREE_TYPE (op1));
2394 dump_printf (MSG_NOTE, ",");
2395 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2396 TREE_TYPE (op2));
2397 if (op3)
2399 dump_printf (MSG_NOTE, ",");
2400 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2401 TREE_TYPE (op3));
2404 if (op4)
2406 dump_printf (MSG_NOTE, ",");
2407 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2408 TREE_TYPE (op4));
2410 dump_printf (MSG_NOTE, "\n");
2413 return NULL;
2416 /* Check that it's ok to change the order of the computation.
2417 Generally, when vectorizing a reduction we change the order of the
2418 computation. This may change the behavior of the program in some
2419 cases, so we need to check that this is ok. One exception is when
2420 vectorizing an outer-loop: the inner-loop is executed sequentially,
2421 and therefore vectorizing reductions in the inner-loop during
2422 outer-loop vectorization is safe. */
2424 /* CHECKME: check for !flag_finite_math_only too? */
2425 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2426 && check_reduction)
2428 /* Changing the order of operations changes the semantics. */
2429 if (dump_enabled_p ())
2430 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2431 "reduction: unsafe fp math optimization: ");
2432 return NULL;
2434 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2435 && check_reduction)
2437 /* Changing the order of operations changes the semantics. */
2438 if (dump_enabled_p ())
2439 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2440 "reduction: unsafe int math optimization: ");
2441 return NULL;
2443 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2445 /* Changing the order of operations changes the semantics. */
2446 if (dump_enabled_p ())
2447 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2448 "reduction: unsafe fixed-point math optimization: ");
2449 return NULL;
2452 /* If we detected "res -= x[i]" earlier, rewrite it into
2453 "res += -x[i]" now. If this turns out to be useless reassoc
2454 will clean it up again. */
2455 if (orig_code == MINUS_EXPR)
2457 tree rhs = gimple_assign_rhs2 (def_stmt);
2458 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2459 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2460 rhs, NULL);
2461 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2462 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2463 loop_info, NULL));
2464 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2465 gimple_assign_set_rhs2 (def_stmt, negrhs);
2466 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2467 update_stmt (def_stmt);
2470 /* Reduction is safe. We're dealing with one of the following:
2471 1) integer arithmetic and no trapv
2472 2) floating point arithmetic, and special flags permit this optimization
2473 3) nested cycle (i.e., outer loop vectorization). */
2474 if (TREE_CODE (op1) == SSA_NAME)
2475 def1 = SSA_NAME_DEF_STMT (op1);
2477 if (TREE_CODE (op2) == SSA_NAME)
2478 def2 = SSA_NAME_DEF_STMT (op2);
2480 if (code != COND_EXPR
2481 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2483 if (dump_enabled_p ())
2484 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2485 return NULL;
2488 /* Check that one def is the reduction def, defined by PHI,
2489 the other def is either defined in the loop ("vect_internal_def"),
2490 or it's an induction (defined by a loop-header phi-node). */
2492 if (def2 && def2 == phi
2493 && (code == COND_EXPR
2494 || !def1 || gimple_nop_p (def1)
2495 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2496 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2497 && (is_gimple_assign (def1)
2498 || is_gimple_call (def1)
2499 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2500 == vect_induction_def
2501 || (gimple_code (def1) == GIMPLE_PHI
2502 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2503 == vect_internal_def
2504 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2506 if (dump_enabled_p ())
2507 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2508 return def_stmt;
2511 if (def1 && def1 == phi
2512 && (code == COND_EXPR
2513 || !def2 || gimple_nop_p (def2)
2514 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2515 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2516 && (is_gimple_assign (def2)
2517 || is_gimple_call (def2)
2518 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2519 == vect_induction_def
2520 || (gimple_code (def2) == GIMPLE_PHI
2521 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2522 == vect_internal_def
2523 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2525 if (check_reduction)
2527 /* Swap operands (just for simplicity - so that the rest of the code
2528 can assume that the reduction variable is always the last (second)
2529 argument). */
2530 if (dump_enabled_p ())
2531 report_vect_op (MSG_NOTE, def_stmt,
2532 "detected reduction: need to swap operands: ");
2534 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2535 gimple_assign_rhs2_ptr (def_stmt));
2537 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2538 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2540 else
2542 if (dump_enabled_p ())
2543 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2546 return def_stmt;
2549 /* Try to find SLP reduction chain. */
2550 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2552 if (dump_enabled_p ())
2553 report_vect_op (MSG_NOTE, def_stmt,
2554 "reduction: detected reduction chain: ");
2556 return def_stmt;
2559 if (dump_enabled_p ())
2560 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2561 "reduction: unknown pattern: ");
2563 return NULL;
2566 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2567 in-place. Arguments as there. */
2569 static gimple
2570 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2571 bool check_reduction, bool *double_reduc)
2573 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2574 double_reduc, false);
2577 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2578 in-place if it enables detection of more reductions. Arguments
2579 as there. */
2581 gimple
2582 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2583 bool check_reduction, bool *double_reduc)
2585 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2586 double_reduc, true);
2589 /* Calculate the cost of one scalar iteration of the loop. */
2590 static int
2591 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2595 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2596 int innerloop_iters, i, stmt_cost;
2598 /* Count statements in scalar loop. Using this as scalar cost for a single
2599 iteration for now.
2601 TODO: Add outer loop support.
2603 TODO: Consider assigning different costs to different scalar
2604 statements. */
2606 /* FORNOW. */
2607 innerloop_iters = 1;
2608 if (loop->inner)
2609 innerloop_iters = 50; /* FIXME */
2611 for (i = 0; i < nbbs; i++)
2613 gimple_stmt_iterator si;
2614 basic_block bb = bbs[i];
2616 if (bb->loop_father == loop->inner)
2617 factor = innerloop_iters;
2618 else
2619 factor = 1;
2621 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2623 gimple stmt = gsi_stmt (si);
2624 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2626 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2627 continue;
2629 /* Skip stmts that are not vectorized inside the loop. */
2630 if (stmt_info
2631 && !STMT_VINFO_RELEVANT_P (stmt_info)
2632 && (!STMT_VINFO_LIVE_P (stmt_info)
2633 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2634 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2635 continue;
2637 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2639 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2640 stmt_cost = vect_get_stmt_cost (scalar_load);
2641 else
2642 stmt_cost = vect_get_stmt_cost (scalar_store);
2644 else
2645 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2647 scalar_single_iter_cost += stmt_cost * factor;
2650 return scalar_single_iter_cost;
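/* E.g. (illustrative): a loop body with one load, one addition and one store
   is costed as vect_get_stmt_cost (scalar_load) + vect_get_stmt_cost
   (scalar_stmt) + vect_get_stmt_cost (scalar_store); statements in a nested
   inner loop are additionally weighted by the FIXME factor of 50 above.  */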
2653 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2654 int
2655 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2656 int *peel_iters_epilogue,
2657 int scalar_single_iter_cost,
2658 stmt_vector_for_cost *prologue_cost_vec,
2659 stmt_vector_for_cost *epilogue_cost_vec)
2661 int retval = 0;
2662 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2664 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2666 *peel_iters_epilogue = vf/2;
2667 if (dump_enabled_p ())
2668 dump_printf_loc (MSG_NOTE, vect_location,
2669 "cost model: epilogue peel iters set to vf/2 "
2670 "because loop iterations are unknown.\n");
2672 /* If peeled iterations are known but the number of scalar loop
2673 iterations is unknown, count a taken branch per peeled loop. */
2674 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2675 NULL, 0, vect_prologue);
2677 else
2679 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2680 peel_iters_prologue = niters < peel_iters_prologue ?
2681 niters : peel_iters_prologue;
2682 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2683 /* If we need to peel for gaps, but no peeling is required, we have to
2684 peel VF iterations. */
2685 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2686 *peel_iters_epilogue = vf;
2689 if (peel_iters_prologue)
2690 retval += record_stmt_cost (prologue_cost_vec,
2691 peel_iters_prologue * scalar_single_iter_cost,
2692 scalar_stmt, NULL, 0, vect_prologue);
2693 if (*peel_iters_epilogue)
2694 retval += record_stmt_cost (epilogue_cost_vec,
2695 *peel_iters_epilogue * scalar_single_iter_cost,
2696 scalar_stmt, NULL, 0, vect_epilogue);
2697 return retval;
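/* Illustrative numbers: with a known trip count of 103, VF == 8 and a
   prologue peel of 5 iterations, the epilogue gets (103 - 5) % 8 == 2
   iterations; both peel counts are then charged as count * scalar single
   iteration cost into the prologue resp. epilogue cost vectors.  */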
2700 /* Function vect_estimate_min_profitable_iters
2702 Return the number of iterations required for the vector version of the
2703 loop to be profitable relative to the cost of the scalar version of the
2704 loop. */
2706 static void
2707 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2708 int *ret_min_profitable_niters,
2709 int *ret_min_profitable_estimate)
2711 int min_profitable_iters;
2712 int min_profitable_estimate;
2713 int peel_iters_prologue;
2714 int peel_iters_epilogue;
2715 unsigned vec_inside_cost = 0;
2716 int vec_outside_cost = 0;
2717 unsigned vec_prologue_cost = 0;
2718 unsigned vec_epilogue_cost = 0;
2719 int scalar_single_iter_cost = 0;
2720 int scalar_outside_cost = 0;
2721 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2722 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2723 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2725 /* Cost model disabled. */
2726 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2728 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2729 *ret_min_profitable_niters = 0;
2730 *ret_min_profitable_estimate = 0;
2731 return;
2734 /* Requires loop versioning tests to handle misalignment. */
2735 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2737 /* FIXME: Make cost depend on complexity of individual check. */
2738 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2739 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2740 vect_prologue);
2741 dump_printf (MSG_NOTE,
2742 "cost model: Adding cost of checks for loop "
2743 "versioning to treat misalignment.\n");
2746 /* Requires loop versioning with alias checks. */
2747 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2749 /* FIXME: Make cost depend on complexity of individual check. */
2750 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2751 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2752 vect_prologue);
2753 dump_printf (MSG_NOTE,
2754 "cost model: Adding cost of checks for loop "
2755 "versioning to check aliasing.\n");
2758 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2759 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2760 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2761 vect_prologue);
2763 /* Count statements in scalar loop. Using this as scalar cost for a single
2764 iteration for now.
2766 TODO: Add outer loop support.
2768 TODO: Consider assigning different costs to different scalar
2769 statements. */
2771 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2773 /* Add additional cost for the peeled instructions in prologue and epilogue
2774 loop.
2776 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2777 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2779 TODO: Build an expression that represents peel_iters for prologue and
2780 epilogue to be used in a run-time test. */
2782 if (npeel < 0)
2784 peel_iters_prologue = vf/2;
2785 dump_printf (MSG_NOTE, "cost model: "
2786 "prologue peel iters set to vf/2.\n");
2788 /* If peeling for alignment is unknown, the loop bound of the main loop
2789 becomes unknown. */
2790 peel_iters_epilogue = vf/2;
2791 dump_printf (MSG_NOTE, "cost model: "
2792 "epilogue peel iters set to vf/2 because "
2793 "peeling for alignment is unknown.\n");
2795 /* If peeled iterations are unknown, count a taken branch and a not taken
2796 branch per peeled loop. Even if scalar loop iterations are known,
2797 vector iterations are not known since peeled prologue iterations are
2798 not known. Hence guards remain the same. */
2799 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2800 NULL, 0, vect_prologue);
2801 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2802 NULL, 0, vect_prologue);
2803 /* FORNOW: Don't attempt to pass individual scalar instructions to
2804 the model; just assume linear cost for scalar iterations. */
2805 (void) add_stmt_cost (target_cost_data,
2806 peel_iters_prologue * scalar_single_iter_cost,
2807 scalar_stmt, NULL, 0, vect_prologue);
2808 (void) add_stmt_cost (target_cost_data,
2809 peel_iters_epilogue * scalar_single_iter_cost,
2810 scalar_stmt, NULL, 0, vect_epilogue);
2812 else
2814 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2815 stmt_info_for_cost *si;
2816 int j;
2817 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2819 prologue_cost_vec.create (2);
2820 epilogue_cost_vec.create (2);
2821 peel_iters_prologue = npeel;
2823 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2824 &peel_iters_epilogue,
2825 scalar_single_iter_cost,
2826 &prologue_cost_vec,
2827 &epilogue_cost_vec);
2829 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2831 struct _stmt_vec_info *stmt_info
2832 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2833 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2834 si->misalign, vect_prologue);
2837 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2839 struct _stmt_vec_info *stmt_info
2840 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2841 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2842 si->misalign, vect_epilogue);
2845 prologue_cost_vec.release ();
2846 epilogue_cost_vec.release ();
2849 /* FORNOW: The scalar outside cost is incremented in one of the
2850 following ways:
2852 1. The vectorizer checks for alignment and aliasing and generates
2853 a condition that allows dynamic vectorization. A cost model
2854 check is ANDED with the versioning condition. Hence scalar code
2855 path now has the added cost of the versioning check.
2857 if (cost > th & versioning_check)
2858 jmp to vector code
2860 Hence run-time scalar is incremented by not-taken branch cost.
2862 2. The vectorizer then checks if a prologue is required. If the
2863 cost model check was not done before during versioning, it has to
2864 be done before the prologue check.
2866 if (cost <= th)
2867 prologue = scalar_iters
2868 if (prologue == 0)
2869 jmp to vector code
2870 else
2871 execute prologue
2872 if (prologue == num_iters)
2873 go to exit
2875 Hence the run-time scalar cost is incremented by a taken branch,
2876 plus a not-taken branch, plus a taken branch cost.
2878 3. The vectorizer then checks if an epilogue is required. If the
2879 cost model check was not done before during prologue check, it
2880 has to be done with the epilogue check.
2882 if (prologue == 0)
2883 jmp to vector code
2884 else
2885 execute prologue
2886 if (prologue == num_iters)
2887 go to exit
2888 vector code:
2889 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2890 jmp to epilogue
2892 Hence the run-time scalar cost should be incremented by 2 taken
2893 branches.
2895 TODO: The back end may reorder the BBS's differently and reverse
2896 conditions/branch directions. Change the estimates below to
2897 something more reasonable. */
2899 /* If the number of iterations is known and we do not do versioning, we can
2900 decide whether to vectorize at compile time. Hence the scalar version
2901 does not carry cost model guard costs. */
2902 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2903 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2904 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2906 /* Cost model check occurs at versioning. */
2907 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2908 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2909 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2910 else
2912 /* Cost model check occurs at prologue generation. */
2913 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2914 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2915 + vect_get_stmt_cost (cond_branch_not_taken);
2916 /* Cost model check occurs at epilogue generation. */
2917 else
2918 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2922 /* Complete the target-specific cost calculations. */
2923 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2924 &vec_inside_cost, &vec_epilogue_cost);
2926 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2928 /* Calculate number of iterations required to make the vector version
2929 profitable, relative to the loop bodies only. The following condition
2930 must hold true:
2931 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2932 where
2933 SIC = scalar iteration cost, VIC = vector iteration cost,
2934 VOC = vector outside cost, VF = vectorization factor,
2935 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2936 SOC = scalar outside cost for run time cost model check. */
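/* A worked example with purely made-up costs: SIC == 4, VIC == 6, VOC == 20,
   SOC == 6, VF == 4, no prologue/epilogue peeling.  SIC * VF == 16 exceeds
   VIC == 6, so the loop body is cheaper when vectorized.  The threshold is
   ((VOC - SOC) * VF) / (SIC * VF - VIC) == 56 / 10 == 5; because
   4 * 4 * 5 <= 6 * 5 + (20 - 6) * 4 holds, it is bumped to 6, then raised to
   at least VF and finally decremented for the "niters <= threshold" guard.  */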
2938 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2940 if (vec_outside_cost <= 0)
2941 min_profitable_iters = 1;
2942 else
2944 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2945 - vec_inside_cost * peel_iters_prologue
2946 - vec_inside_cost * peel_iters_epilogue)
2947 / ((scalar_single_iter_cost * vf)
2948 - vec_inside_cost);
2950 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2951 <= (((int) vec_inside_cost * min_profitable_iters)
2952 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2953 min_profitable_iters++;
2956 /* vector version will never be profitable. */
2957 else
2959 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
2960 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
2961 "did not happen for a simd loop");
2963 if (dump_enabled_p ())
2964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2965 "cost model: the vector iteration cost = %d "
2966 "divided by the scalar iteration cost = %d "
2967 "is greater than or equal to the vectorization factor = %d"
2968 ".\n",
2969 vec_inside_cost, scalar_single_iter_cost, vf);
2970 *ret_min_profitable_niters = -1;
2971 *ret_min_profitable_estimate = -1;
2972 return;
2975 if (dump_enabled_p ())
2977 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2978 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
2979 vec_inside_cost);
2980 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
2981 vec_prologue_cost);
2982 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
2983 vec_epilogue_cost);
2984 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
2985 scalar_single_iter_cost);
2986 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
2987 scalar_outside_cost);
2988 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
2989 vec_outside_cost);
2990 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
2991 peel_iters_prologue);
2992 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
2993 peel_iters_epilogue);
2994 dump_printf (MSG_NOTE,
2995 " Calculated minimum iters for profitability: %d\n",
2996 min_profitable_iters);
2997 dump_printf (MSG_NOTE, "\n");
3000 min_profitable_iters =
3001 min_profitable_iters < vf ? vf : min_profitable_iters;
3003 /* Because the condition we create is:
3004 if (niters <= min_profitable_iters)
3005 then skip the vectorized loop. */
3006 min_profitable_iters--;
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 " Runtime profitability threshold = %d\n",
3011 min_profitable_iters);
3013 *ret_min_profitable_niters = min_profitable_iters;
3015 /* Calculate number of iterations required to make the vector version
3016 profitable, relative to the loop bodies only.
3018 Non-vectorized variant is SIC * niters and it must win over vector
3019 variant on the expected loop trip count. The following condition must hold true:
3020 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3022 if (vec_outside_cost <= 0)
3023 min_profitable_estimate = 1;
3024 else
3026 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3027 - vec_inside_cost * peel_iters_prologue
3028 - vec_inside_cost * peel_iters_epilogue)
3029 / ((scalar_single_iter_cost * vf)
3030 - vec_inside_cost);
3032 min_profitable_estimate --;
3033 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3034 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 " Static estimate profitability threshold = %d\n",
3037 min_profitable_estimate);
3039 *ret_min_profitable_estimate = min_profitable_estimate;
3043 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3044 functions. Design better to avoid maintenance issues. */
3046 /* Function vect_model_reduction_cost.
3048 Models cost for a reduction operation, including the vector ops
3049 generated within the strip-mine loop, the initial definition before
3050 the loop, and the epilogue code that must be generated. */
3052 static bool
3053 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3054 int ncopies)
3056 int prologue_cost = 0, epilogue_cost = 0;
3057 enum tree_code code;
3058 optab optab;
3059 tree vectype;
3060 gimple stmt, orig_stmt;
3061 tree reduction_op;
3062 enum machine_mode mode;
3063 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3064 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3065 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3067 /* Cost of reduction op inside loop. */
3068 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3069 stmt_info, 0, vect_body);
3070 stmt = STMT_VINFO_STMT (stmt_info);
3072 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3074 case GIMPLE_SINGLE_RHS:
3075 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3076 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3077 break;
3078 case GIMPLE_UNARY_RHS:
3079 reduction_op = gimple_assign_rhs1 (stmt);
3080 break;
3081 case GIMPLE_BINARY_RHS:
3082 reduction_op = gimple_assign_rhs2 (stmt);
3083 break;
3084 case GIMPLE_TERNARY_RHS:
3085 reduction_op = gimple_assign_rhs3 (stmt);
3086 break;
3087 default:
3088 gcc_unreachable ();
3091 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3092 if (!vectype)
3094 if (dump_enabled_p ())
3096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3097 "unsupported data-type ");
3098 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3099 TREE_TYPE (reduction_op));
3100 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3102 return false;
3105 mode = TYPE_MODE (vectype);
3106 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3108 if (!orig_stmt)
3109 orig_stmt = STMT_VINFO_STMT (stmt_info);
3111 code = gimple_assign_rhs_code (orig_stmt);
3113 /* Add in cost for initial definition. */
3114 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3115 stmt_info, 0, vect_prologue);
3117 /* Determine cost of epilogue code.
3119 We have a reduction operator that will reduce the vector in one statement.
3120 Also requires scalar extract. */
3122 if (!nested_in_vect_loop_p (loop, orig_stmt))
3124 if (reduc_code != ERROR_MARK)
3126 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3127 stmt_info, 0, vect_epilogue);
3128 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3129 stmt_info, 0, vect_epilogue);
3131 else
3133 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3134 tree bitsize =
3135 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3136 int element_bitsize = tree_to_uhwi (bitsize);
3137 int nelements = vec_size_in_bits / element_bitsize;
3139 optab = optab_for_tree_code (code, vectype, optab_default);
3141 /* We have a whole vector shift available. */
3142 if (VECTOR_MODE_P (mode)
3143 && optab_handler (optab, mode) != CODE_FOR_nothing
3144 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3146 /* Final reduction via vector shifts and the reduction operator.
3147 Also requires scalar extract. */
3148 epilogue_cost += add_stmt_cost (target_cost_data,
3149 exact_log2 (nelements) * 2,
3150 vector_stmt, stmt_info, 0,
3151 vect_epilogue);
3152 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3153 vec_to_scalar, stmt_info, 0,
3154 vect_epilogue);
3156 else
3157 /* Use extracts and reduction op for final reduction. For N
3158 elements, we have N extracts and N-1 reduction ops. */
3159 epilogue_cost += add_stmt_cost (target_cost_data,
3160 nelements + nelements - 1,
3161 vector_stmt, stmt_info, 0,
3162 vect_epilogue);
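/* Illustration: for a V8HI reduction without a REDUC_* epilogue pattern,
   nelements == 8.  If a whole-vector shift is available the epilogue is
   costed as exact_log2 (8) * 2 == 6 vector stmts plus one vec_to_scalar
   extract; otherwise it is costed as 8 + 7 == 15 vector stmts (N extracts
   and N-1 scalar reduction ops).  */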
3166 if (dump_enabled_p ())
3167 dump_printf (MSG_NOTE,
3168 "vect_model_reduction_cost: inside_cost = %d, "
3169 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3170 prologue_cost, epilogue_cost);
3172 return true;
3176 /* Function vect_model_induction_cost.
3178 Models cost for induction operations. */
3180 static void
3181 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3183 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3184 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3185 unsigned inside_cost, prologue_cost;
3187 /* loop cost for vec_loop. */
3188 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3189 stmt_info, 0, vect_body);
3191 /* prologue cost for vec_init and vec_step. */
3192 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3193 stmt_info, 0, vect_prologue);
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_NOTE, vect_location,
3197 "vect_model_induction_cost: inside_cost = %d, "
3198 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3202 /* Function get_initial_def_for_induction
3204 Input:
3205 STMT - a stmt that performs an induction operation in the loop.
3206 IV_PHI - the initial value of the induction variable
3208 Output:
3209 Return a vector variable, initialized with the first VF values of
3210 the induction variable. E.g., for an iv with IV_PHI='X' and
3211 evolution S, for a vector of 4 units, we want to return:
3212 [X, X + S, X + 2*S, X + 3*S]. */
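/* For instance (hypothetical loop, non-nested case, VF == 4):

     for (i = 0; i < n; i++)
       { a[i] = j; j += 3; }

   with initial value j0 the vector returned is [j0, j0+3, j0+6, j0+9] and the
   step vector created below is [12, 12, 12, 12], i.e. VF * S in every lane.  */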
3214 static tree
3215 get_initial_def_for_induction (gimple iv_phi)
3217 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3219 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3220 tree vectype;
3221 int nunits;
3222 edge pe = loop_preheader_edge (loop);
3223 struct loop *iv_loop;
3224 basic_block new_bb;
3225 tree new_vec, vec_init, vec_step, t;
3226 tree new_var;
3227 tree new_name;
3228 gimple init_stmt, induction_phi, new_stmt;
3229 tree induc_def, vec_def, vec_dest;
3230 tree init_expr, step_expr;
3231 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3232 int i;
3233 int ncopies;
3234 tree expr;
3235 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3236 bool nested_in_vect_loop = false;
3237 gimple_seq stmts = NULL;
3238 imm_use_iterator imm_iter;
3239 use_operand_p use_p;
3240 gimple exit_phi;
3241 edge latch_e;
3242 tree loop_arg;
3243 gimple_stmt_iterator si;
3244 basic_block bb = gimple_bb (iv_phi);
3245 tree stepvectype;
3246 tree resvectype;
3248 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3249 if (nested_in_vect_loop_p (loop, iv_phi))
3251 nested_in_vect_loop = true;
3252 iv_loop = loop->inner;
3254 else
3255 iv_loop = loop;
3256 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3258 latch_e = loop_latch_edge (iv_loop);
3259 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3261 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3262 gcc_assert (step_expr != NULL_TREE);
3264 pe = loop_preheader_edge (iv_loop);
3265 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3266 loop_preheader_edge (iv_loop));
3268 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3269 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3270 gcc_assert (vectype);
3271 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3272 ncopies = vf / nunits;
3274 gcc_assert (phi_info);
3275 gcc_assert (ncopies >= 1);
3277 /* Convert the step to the desired type. */
3278 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3279 step_expr),
3280 &stmts, true, NULL_TREE);
3281 if (stmts)
3283 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3284 gcc_assert (!new_bb);
3287 /* Find the first insertion point in the BB. */
3288 si = gsi_after_labels (bb);
3290 /* Create the vector that holds the initial_value of the induction. */
3291 if (nested_in_vect_loop)
3293 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3294 been created during vectorization of previous stmts. We obtain it
3295 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3296 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3297 /* If the initial value is not of proper type, convert it. */
3298 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3300 new_stmt = gimple_build_assign_with_ops
3301 (VIEW_CONVERT_EXPR,
3302 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3303 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3304 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3305 gimple_assign_set_lhs (new_stmt, vec_init);
3306 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3307 new_stmt);
3308 gcc_assert (!new_bb);
3309 set_vinfo_for_stmt (new_stmt,
3310 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3313 else
3315 vec<constructor_elt, va_gc> *v;
3317 /* iv_loop is the loop to be vectorized. Create:
3318 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3319 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3320 vect_scalar_var, "var_");
3321 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3322 init_expr),
3323 &stmts, false, new_var);
3324 if (stmts)
3326 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3327 gcc_assert (!new_bb);
3330 vec_alloc (v, nunits);
3331 bool constant_p = is_gimple_min_invariant (new_name);
3332 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3333 for (i = 1; i < nunits; i++)
3335 /* Create: new_name_i = new_name + step_expr */
3336 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3337 new_name, step_expr);
3338 if (!is_gimple_min_invariant (new_name))
3340 init_stmt = gimple_build_assign (new_var, new_name);
3341 new_name = make_ssa_name (new_var, init_stmt);
3342 gimple_assign_set_lhs (init_stmt, new_name);
3343 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3344 gcc_assert (!new_bb);
3345 if (dump_enabled_p ())
3347 dump_printf_loc (MSG_NOTE, vect_location,
3348 "created new init_stmt: ");
3349 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3350 dump_printf (MSG_NOTE, "\n");
3352 constant_p = false;
3354 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3356 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3357 if (constant_p)
3358 new_vec = build_vector_from_ctor (vectype, v);
3359 else
3360 new_vec = build_constructor (vectype, v);
3361 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3365 /* Create the vector that holds the step of the induction. */
3366 if (nested_in_vect_loop)
3367 /* iv_loop is nested in the loop to be vectorized. Generate:
3368 vec_step = [S, S, S, S] */
3369 new_name = step_expr;
3370 else
3372 /* iv_loop is the loop to be vectorized. Generate:
3373 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3374 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3376 expr = build_int_cst (integer_type_node, vf);
3377 expr = fold_convert (TREE_TYPE (step_expr), expr);
3379 else
3380 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3381 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3382 expr, step_expr);
3383 if (TREE_CODE (step_expr) == SSA_NAME)
3384 new_name = vect_init_vector (iv_phi, new_name,
3385 TREE_TYPE (step_expr), NULL);
3388 t = unshare_expr (new_name);
3389 gcc_assert (CONSTANT_CLASS_P (new_name)
3390 || TREE_CODE (new_name) == SSA_NAME);
3391 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3392 gcc_assert (stepvectype);
3393 new_vec = build_vector_from_val (stepvectype, t);
3394 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3397 /* Create the following def-use cycle:
3398 loop prolog:
3399 vec_init = ...
3400 vec_step = ...
3401 loop:
3402 vec_iv = PHI <vec_init, vec_loop>
3404 STMT
3406 vec_loop = vec_iv + vec_step; */
3408 /* Create the induction-phi that defines the induction-operand. */
3409 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3410 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3411 set_vinfo_for_stmt (induction_phi,
3412 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3413 induc_def = PHI_RESULT (induction_phi);
3415 /* Create the iv update inside the loop */
3416 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3417 induc_def, vec_step);
3418 vec_def = make_ssa_name (vec_dest, new_stmt);
3419 gimple_assign_set_lhs (new_stmt, vec_def);
3420 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3421 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3422 NULL));
3424 /* Set the arguments of the phi node: */
3425 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3426 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3427 UNKNOWN_LOCATION);
3430 /* In case the vectorization factor (VF) is bigger than the number
3431 of elements that we can fit in a vectype (nunits), we have to generate
3432 more than one vector stmt - i.e., we need to "unroll" the
3433 vector stmt by a factor VF/nunits. For more details see the documentation
3434 in vectorizable_operation. */
3436 if (ncopies > 1)
3438 stmt_vec_info prev_stmt_vinfo;
3439 /* FORNOW. This restriction should be relaxed. */
3440 gcc_assert (!nested_in_vect_loop);
3442 /* Create the vector that holds the step of the induction. */
3443 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3445 expr = build_int_cst (integer_type_node, nunits);
3446 expr = fold_convert (TREE_TYPE (step_expr), expr);
3448 else
3449 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3450 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3451 expr, step_expr);
3452 if (TREE_CODE (step_expr) == SSA_NAME)
3453 new_name = vect_init_vector (iv_phi, new_name,
3454 TREE_TYPE (step_expr), NULL);
3455 t = unshare_expr (new_name);
3456 gcc_assert (CONSTANT_CLASS_P (new_name)
3457 || TREE_CODE (new_name) == SSA_NAME);
3458 new_vec = build_vector_from_val (stepvectype, t);
3459 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3461 vec_def = induc_def;
3462 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3463 for (i = 1; i < ncopies; i++)
3465 /* vec_i = vec_prev + vec_step */
3466 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3467 vec_def, vec_step);
3468 vec_def = make_ssa_name (vec_dest, new_stmt);
3469 gimple_assign_set_lhs (new_stmt, vec_def);
3471 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3472 if (!useless_type_conversion_p (resvectype, vectype))
3474 new_stmt = gimple_build_assign_with_ops
3475 (VIEW_CONVERT_EXPR,
3476 vect_get_new_vect_var (resvectype, vect_simple_var,
3477 "vec_iv_"),
3478 build1 (VIEW_CONVERT_EXPR, resvectype,
3479 gimple_assign_lhs (new_stmt)), NULL_TREE);
3480 gimple_assign_set_lhs (new_stmt,
3481 make_ssa_name
3482 (gimple_assign_lhs (new_stmt), new_stmt));
3483 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3485 set_vinfo_for_stmt (new_stmt,
3486 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3487 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3488 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3492 if (nested_in_vect_loop)
3494 /* Find the loop-closed exit-phi of the induction, and record
3495 the final vector of induction results: */
3496 exit_phi = NULL;
3497 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3499 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3501 exit_phi = USE_STMT (use_p);
3502 break;
3505 if (exit_phi)
3507 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3508 /* FORNOW. Currently not supporting the case that an inner-loop induction
3509 is not used in the outer-loop (i.e. only outside the outer-loop). */
3510 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3511 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3513 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3514 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 "vector of inductions after inner-loop:");
3518 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3519 dump_printf (MSG_NOTE, "\n");
3525 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE, vect_location,
3528 "transform induction: created def-use cycle: ");
3529 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3530 dump_printf (MSG_NOTE, "\n");
3531 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3532 SSA_NAME_DEF_STMT (vec_def), 0);
3533 dump_printf (MSG_NOTE, "\n");
3536 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3537 if (!useless_type_conversion_p (resvectype, vectype))
3539 new_stmt = gimple_build_assign_with_ops
3540 (VIEW_CONVERT_EXPR,
3541 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3542 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3543 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3544 gimple_assign_set_lhs (new_stmt, induc_def);
3545 si = gsi_after_labels (bb);
3546 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3547 set_vinfo_for_stmt (new_stmt,
3548 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3549 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3550 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3553 return induc_def;
3557 /* Function get_initial_def_for_reduction
3559 Input:
3560 STMT - a stmt that performs a reduction operation in the loop.
3561 INIT_VAL - the initial value of the reduction variable
3563 Output:
3564 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3565 of the reduction (used for adjusting the epilog - see below).
3566 Return a vector variable, initialized according to the operation that STMT
3567 performs. This vector will be used as the initial value of the
3568 vector of partial results.
3570 Option1 (adjust in epilog): Initialize the vector as follows:
3571 add/bit or/xor: [0,0,...,0,0]
3572 mult/bit and: [1,1,...,1,1]
3573 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3574 and when necessary (e.g. add/mult case) let the caller know
3575 that it needs to adjust the result by init_val.
3577 Option2: Initialize the vector as follows:
3578 add/bit or/xor: [init_val,0,0,...,0]
3579 mult/bit and: [init_val,1,1,...,1]
3580 min/max/cond_expr: [init_val,init_val,...,init_val]
3581 and no adjustments are needed.
3583 For example, for the following code:
3585 s = init_val;
3586 for (i=0;i<n;i++)
3587 s = s + a[i];
3589 STMT is 's = s + a[i]', and the reduction variable is 's'.
3590 For a vector of 4 units, we want to return either [0,0,0,init_val],
3591 or [0,0,0,0] and let the caller know that it needs to adjust
3592 the result at the end by 'init_val'.
3594 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3595 is not NULL, because this way the initialization vector is simpler (same
3596 element in all entries); otherwise we use Option2.
3598 A cost model should help decide between these two schemes. */
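/* E.g. for a product reduction s = s * a[i] with VF == 4, Option1 builds
   [1,1,1,1] and sets ADJUSTMENT_DEF to init_val so the caller folds the
   initial value back into the final result; Option2 would instead build
   [init_val,1,1,1] with no adjustment needed.  */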
3600 tree
3601 get_initial_def_for_reduction (gimple stmt, tree init_val,
3602 tree *adjustment_def)
3604 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3605 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3606 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3607 tree scalar_type = TREE_TYPE (init_val);
3608 tree vectype = get_vectype_for_scalar_type (scalar_type);
3609 int nunits;
3610 enum tree_code code = gimple_assign_rhs_code (stmt);
3611 tree def_for_init;
3612 tree init_def;
3613 tree *elts;
3614 int i;
3615 bool nested_in_vect_loop = false;
3616 tree init_value;
3617 REAL_VALUE_TYPE real_init_val = dconst0;
3618 int int_init_val = 0;
3619 gimple def_stmt = NULL;
3621 gcc_assert (vectype);
3622 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3624 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3625 || SCALAR_FLOAT_TYPE_P (scalar_type));
3627 if (nested_in_vect_loop_p (loop, stmt))
3628 nested_in_vect_loop = true;
3629 else
3630 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3632 /* In case of double reduction we only create a vector variable to be put
3633 in the reduction phi node. The actual statement creation is done in
3634 vect_create_epilog_for_reduction. */
3635 if (adjustment_def && nested_in_vect_loop
3636 && TREE_CODE (init_val) == SSA_NAME
3637 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3638 && gimple_code (def_stmt) == GIMPLE_PHI
3639 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3640 && vinfo_for_stmt (def_stmt)
3641 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3642 == vect_double_reduction_def)
3644 *adjustment_def = NULL;
3645 return vect_create_destination_var (init_val, vectype);
3648 if (TREE_CONSTANT (init_val))
3650 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3651 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3652 else
3653 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3655 else
3656 init_value = init_val;
3658 switch (code)
3660 case WIDEN_SUM_EXPR:
3661 case DOT_PROD_EXPR:
3662 case PLUS_EXPR:
3663 case MINUS_EXPR:
3664 case BIT_IOR_EXPR:
3665 case BIT_XOR_EXPR:
3666 case MULT_EXPR:
3667 case BIT_AND_EXPR:
3668 /* ADJUSTMENT_DEF is NULL when called from
3669 vect_create_epilog_for_reduction to vectorize double reduction. */
3670 if (adjustment_def)
3672 if (nested_in_vect_loop)
3673 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3674 NULL);
3675 else
3676 *adjustment_def = init_val;
3679 if (code == MULT_EXPR)
3681 real_init_val = dconst1;
3682 int_init_val = 1;
3685 if (code == BIT_AND_EXPR)
3686 int_init_val = -1;
3688 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3689 def_for_init = build_real (scalar_type, real_init_val);
3690 else
3691 def_for_init = build_int_cst (scalar_type, int_init_val);
3693 /* Fill all elements but the first with '0' or '1'. */
3694 elts = XALLOCAVEC (tree, nunits);
3695 for (i = nunits - 2; i >= 0; --i)
3696 elts[i + 1] = def_for_init;
3698 /* Option1: the first element is '0' or '1' as well. */
3699 if (adjustment_def)
3701 elts[0] = def_for_init;
3702 init_def = build_vector (vectype, elts);
3703 break;
3706 /* Option2: the first element is INIT_VAL. */
3707 elts[0] = init_val;
3708 if (TREE_CONSTANT (init_val))
3709 init_def = build_vector (vectype, elts);
3710 else
3712 vec<constructor_elt, va_gc> *v;
3713 vec_alloc (v, nunits);
3714 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3715 for (i = 1; i < nunits; ++i)
3716 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3717 init_def = build_constructor (vectype, v);
3720 break;
3722 case MIN_EXPR:
3723 case MAX_EXPR:
3724 case COND_EXPR:
3725 if (adjustment_def)
3727 *adjustment_def = NULL_TREE;
3728 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3729 break;
3732 init_def = build_vector_from_val (vectype, init_value);
3733 break;
3735 default:
3736 gcc_unreachable ();
3739 return init_def;
3743 /* Function vect_create_epilog_for_reduction
3745 Create code at the loop-epilog to finalize the result of a reduction
3746 computation.
3748 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3749 reduction statements.
3750 STMT is the scalar reduction stmt that is being vectorized.
3751 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3752 number of elements that we can fit in a vectype (nunits). In this case
3753 we have to generate more than one vector stmt - i.e - we need to "unroll"
3754 the vector stmt by a factor VF/nunits. For more details see documentation
3755 in vectorizable_operation.
3756 REDUC_CODE is the tree-code for the epilog reduction.
3757 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3758 computation.
3759 REDUC_INDEX is the index of the operand in the right hand side of the
3760 statement that is defined by REDUCTION_PHI.
3761 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3762 SLP_NODE is an SLP node containing a group of reduction statements. The
3763 first one in this group is STMT.
3765 This function:
3766 1. Creates the reduction def-use cycles: sets the arguments for
3767 REDUCTION_PHIS:
3768 The loop-entry argument is the vectorized initial-value of the reduction.
3769 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3770 sums.
3771 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3772 by applying the operation specified by REDUC_CODE if available, or by
3773 other means (whole-vector shifts or a scalar loop).
3774 The function also creates a new phi node at the loop exit to preserve
3775 loop-closed form, as illustrated below.
3777 The flow at the entry to this function:
3779 loop:
3780 vec_def = phi <null, null> # REDUCTION_PHI
3781 VECT_DEF = vector_stmt # vectorized form of STMT
3782 s_loop = scalar_stmt # (scalar) STMT
3783 loop_exit:
3784 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3785 use <s_out0>
3786 use <s_out0>
3788 The above is transformed by this function into:
3790 loop:
3791 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3792 VECT_DEF = vector_stmt # vectorized form of STMT
3793 s_loop = scalar_stmt # (scalar) STMT
3794 loop_exit:
3795 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3796 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3797 v_out2 = reduce <v_out1>
3798 s_out3 = extract_field <v_out2, 0>
3799 s_out4 = adjust_result <s_out3>
3800 use <s_out4>
3801 use <s_out4>
3804 static void
3805 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3806 int ncopies, enum tree_code reduc_code,
3807 vec<gimple> reduction_phis,
3808 int reduc_index, bool double_reduc,
3809 slp_tree slp_node)
3811 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3812 stmt_vec_info prev_phi_info;
3813 tree vectype;
3814 enum machine_mode mode;
3815 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3816 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3817 basic_block exit_bb;
3818 tree scalar_dest;
3819 tree scalar_type;
3820 gimple new_phi = NULL, phi;
3821 gimple_stmt_iterator exit_gsi;
3822 tree vec_dest;
3823 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3824 gimple epilog_stmt = NULL;
3825 enum tree_code code = gimple_assign_rhs_code (stmt);
3826 gimple exit_phi;
3827 tree bitsize, bitpos;
3828 tree adjustment_def = NULL;
3829 tree vec_initial_def = NULL;
3830 tree reduction_op, expr, def;
3831 tree orig_name, scalar_result;
3832 imm_use_iterator imm_iter, phi_imm_iter;
3833 use_operand_p use_p, phi_use_p;
3834 bool extract_scalar_result = false;
3835 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3836 bool nested_in_vect_loop = false;
3837 auto_vec<gimple> new_phis;
3838 auto_vec<gimple> inner_phis;
3839 enum vect_def_type dt = vect_unknown_def_type;
3840 int j, i;
3841 auto_vec<tree> scalar_results;
3842 unsigned int group_size = 1, k, ratio;
3843 auto_vec<tree> vec_initial_defs;
3844 auto_vec<gimple> phis;
3845 bool slp_reduc = false;
3846 tree new_phi_result;
3847 gimple inner_phi = NULL;
3849 if (slp_node)
3850 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3852 if (nested_in_vect_loop_p (loop, stmt))
3854 outer_loop = loop;
3855 loop = loop->inner;
3856 nested_in_vect_loop = true;
3857 gcc_assert (!slp_node);
3860 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3862 case GIMPLE_SINGLE_RHS:
3863 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3864 == ternary_op);
3865 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3866 break;
3867 case GIMPLE_UNARY_RHS:
3868 reduction_op = gimple_assign_rhs1 (stmt);
3869 break;
3870 case GIMPLE_BINARY_RHS:
3871 reduction_op = reduc_index ?
3872 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3873 break;
3874 case GIMPLE_TERNARY_RHS:
3875 reduction_op = gimple_op (stmt, reduc_index + 1);
3876 break;
3877 default:
3878 gcc_unreachable ();
3881 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3882 gcc_assert (vectype);
3883 mode = TYPE_MODE (vectype);
3885 /* 1. Create the reduction def-use cycle:
3886 Set the arguments of REDUCTION_PHIS, i.e., transform
3888 loop:
3889 vec_def = phi <null, null> # REDUCTION_PHI
3890 VECT_DEF = vector_stmt # vectorized form of STMT
3893 into:
3895 loop:
3896 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3897 VECT_DEF = vector_stmt # vectorized form of STMT
3900 (in case of SLP, do it for all the phis). */
3902 /* Get the loop-entry arguments. */
3903 if (slp_node)
3904 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3905 NULL, slp_node, reduc_index);
3906 else
3908 vec_initial_defs.create (1);
3909 /* For the case of reduction, vect_get_vec_def_for_operand returns
3910 the scalar def before the loop, that defines the initial value
3911 of the reduction variable. */
3912 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3913 &adjustment_def);
3914 vec_initial_defs.quick_push (vec_initial_def);
3917 /* Set phi nodes arguments. */
3918 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3920 tree vec_init_def = vec_initial_defs[i];
3921 tree def = vect_defs[i];
3922 for (j = 0; j < ncopies; j++)
3924 /* Set the loop-entry arg of the reduction-phi. */
3925 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3926 UNKNOWN_LOCATION);
3928 /* Set the loop-latch arg for the reduction-phi. */
3929 if (j > 0)
3930 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3932 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3934 if (dump_enabled_p ())
3936 dump_printf_loc (MSG_NOTE, vect_location,
3937 "transform reduction: created def-use cycle: ");
3938 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3939 dump_printf (MSG_NOTE, "\n");
3940 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3941 dump_printf (MSG_NOTE, "\n");
3944 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3948 /* 2. Create epilog code.
3949 The reduction epilog code operates across the elements of the vector
3950 of partial results computed by the vectorized loop.
3951 The reduction epilog code consists of:
3953 step 1: compute the scalar result in a vector (v_out2)
3954 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3955 step 3: adjust the scalar result (s_out3) if needed.
3957 Step 1 can be accomplished using one of the following three schemes:
3958 (scheme 1) using reduc_code, if available.
3959 (scheme 2) using whole-vector shifts, if available.
3960 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3961 combined.
3963 The overall epilog code looks like this:
3965 s_out0 = phi <s_loop> # original EXIT_PHI
3966 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3967 v_out2 = reduce <v_out1> # step 1
3968 s_out3 = extract_field <v_out2, 0> # step 2
3969 s_out4 = adjust_result <s_out3> # step 3
3971 (step 3 is optional, and steps 1 and 2 may be combined).
3972 Lastly, the uses of s_out0 are replaced by s_out4. */
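/* As an illustration (hypothetical values, PLUS reduction, 4 elements,
   scalar initial value 10 kept as the epilog adjustment):
     v_out1 = {1,2,3,4}
     step 1:  v_out2 = reduce <v_out1>               lanes sum to 10
     step 2:  s_out3 = extract_field <v_out2, pos>   -> 10
     step 3:  s_out4 = s_out3 + 10                   -> 20
   where 'pos' is the lane in which the target's reduction leaves the
   result.  */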
3975 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3976 v_out1 = phi <VECT_DEF>
3977 Store them in NEW_PHIS. */
3979 exit_bb = single_exit (loop)->dest;
3980 prev_phi_info = NULL;
3981 new_phis.create (vect_defs.length ());
3982 FOR_EACH_VEC_ELT (vect_defs, i, def)
3984 for (j = 0; j < ncopies; j++)
3986 tree new_def = copy_ssa_name (def, NULL);
3987 phi = create_phi_node (new_def, exit_bb);
3988 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3989 if (j == 0)
3990 new_phis.quick_push (phi);
3991 else
3993 def = vect_get_vec_def_for_stmt_copy (dt, def);
3994 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3997 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3998 prev_phi_info = vinfo_for_stmt (phi);
4002 /* The epilogue is created for the outer-loop, i.e., for the loop being
4003 vectorized. Create exit phis for the outer loop. */
4004 if (double_reduc)
4006 loop = outer_loop;
4007 exit_bb = single_exit (loop)->dest;
4008 inner_phis.create (vect_defs.length ());
4009 FOR_EACH_VEC_ELT (new_phis, i, phi)
4011 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4012 gimple outer_phi = create_phi_node (new_result, exit_bb);
4013 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4014 PHI_RESULT (phi));
4015 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4016 loop_vinfo, NULL));
4017 inner_phis.quick_push (phi);
4018 new_phis[i] = outer_phi;
4019 prev_phi_info = vinfo_for_stmt (outer_phi);
4020 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4022 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4023 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4024 outer_phi = create_phi_node (new_result, exit_bb);
4025 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4026 PHI_RESULT (phi));
4027 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4028 loop_vinfo, NULL));
4029 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4030 prev_phi_info = vinfo_for_stmt (outer_phi);
4035 exit_gsi = gsi_after_labels (exit_bb);
4037 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4038 (i.e. when reduc_code is not available) and in the final adjustment
4039 code (if needed). Also get the original scalar reduction variable as
4040 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4041 represents a reduction pattern), the tree-code and scalar-def are
4042 taken from the original stmt that the pattern-stmt (STMT) replaces.
4043 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4044 are taken from STMT. */
4046 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4047 if (!orig_stmt)
4049 /* Regular reduction */
4050 orig_stmt = stmt;
4052 else
4054 /* Reduction pattern */
4055 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4056 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4057 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4060 code = gimple_assign_rhs_code (orig_stmt);
4061 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4062 partial results are added and not subtracted. */
4063 if (code == MINUS_EXPR)
4064 code = PLUS_EXPR;
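/* E.g. (illustrative): for 's = s - a[i]' with init_val = 10, VF = 4 and
   a = {1,2,3,4}, the loop computes {10,0,0,0} - {1,2,3,4} = {9,-2,-3,-4};
   adding the lanes with PLUS_EXPR gives 0, i.e. 10 - (1+2+3+4), which is
   why the partial results must be added rather than subtracted here.  */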
4066 scalar_dest = gimple_assign_lhs (orig_stmt);
4067 scalar_type = TREE_TYPE (scalar_dest);
4068 scalar_results.create (group_size);
4069 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4070 bitsize = TYPE_SIZE (scalar_type);
4072 /* In case this is a reduction in an inner-loop while vectorizing an outer
4073 loop - we don't need to extract a single scalar result at the end of the
4074 inner-loop (unless it is double reduction, i.e., the use of reduction is
4075 outside the outer-loop). The final vector of partial results will be used
4076 in the vectorized outer-loop, or reduced to a scalar result at the end of
4077 the outer-loop. */
4078 if (nested_in_vect_loop && !double_reduc)
4079 goto vect_finalize_reduction;
4081 /* SLP reduction without reduction chain, e.g.,
4082 # a1 = phi <a2, a0>
4083 # b1 = phi <b2, b0>
4084 a2 = operation (a1)
4085 b2 = operation (b1) */
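/* Such a group of independent reductions can come, for instance, from a
   scalar loop like (illustrative):
     for (i = 0; i < n; i++)
       {
         suma += a[2*i];
         sumb += a[2*i+1];
       }
   with both suma and sumb live after the loop.  */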
4086 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4088 /* In case of reduction chain, e.g.,
4089 # a1 = phi <a3, a0>
4090 a2 = operation (a1)
4091 a3 = operation (a2),
4093 we may end up with more than one vector result. Here we reduce them to
4094 one vector. */
4095 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4097 tree first_vect = PHI_RESULT (new_phis[0]);
4098 tree tmp;
4099 gimple new_vec_stmt = NULL;
4101 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4102 for (k = 1; k < new_phis.length (); k++)
4104 gimple next_phi = new_phis[k];
4105 tree second_vect = PHI_RESULT (next_phi);
4107 tmp = build2 (code, vectype, first_vect, second_vect);
4108 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4109 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4110 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4111 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4114 new_phi_result = first_vect;
4115 if (new_vec_stmt)
4117 new_phis.truncate (0);
4118 new_phis.safe_push (new_vec_stmt);
4121 else
4122 new_phi_result = PHI_RESULT (new_phis[0]);
4124 /* 2.3 Create the reduction code, using one of the three schemes described
4125 above. In SLP we simply need to extract all the elements from the
4126 vector (without reducing them), so we use scalar shifts. */
4127 if (reduc_code != ERROR_MARK && !slp_reduc)
4129 tree tmp;
4131 /*** Case 1: Create:
4132 v_out2 = reduc_expr <v_out1> */
4134 if (dump_enabled_p ())
4135 dump_printf_loc (MSG_NOTE, vect_location,
4136 "Reduce using direct vector reduction.\n");
4138 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4139 tmp = build1 (reduc_code, vectype, new_phi_result);
4140 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4141 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4142 gimple_assign_set_lhs (epilog_stmt, new_temp);
4143 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4145 extract_scalar_result = true;
4147 else
4149 enum tree_code shift_code = ERROR_MARK;
4150 bool have_whole_vector_shift = true;
4151 int bit_offset;
4152 int element_bitsize = tree_to_uhwi (bitsize);
4153 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4154 tree vec_temp;
4156 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4157 shift_code = VEC_RSHIFT_EXPR;
4158 else
4159 have_whole_vector_shift = false;
4161 /* Regardless of whether we have a whole vector shift, if we're
4162 emulating the operation via tree-vect-generic, we don't want
4163 to use it. Only the first round of the reduction is likely
4164 to still be profitable via emulation. */
4165 /* ??? It might be better to emit a reduction tree code here, so that
4166 tree-vect-generic can expand the first round via bit tricks. */
4167 if (!VECTOR_MODE_P (mode))
4168 have_whole_vector_shift = false;
4169 else
4171 optab optab = optab_for_tree_code (code, vectype, optab_default);
4172 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4173 have_whole_vector_shift = false;
4176 if (have_whole_vector_shift && !slp_reduc)
4178 /*** Case 2: Create:
4179 for (offset = VS/2; offset >= element_size; offset/=2)
4181 Create: va' = vec_shift <va, offset>
4182 Create: va = vop <va, va'>
4183 } */
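/* Illustration (hypothetical 128-bit vector of four 32-bit elements,
   one possible element ordering):
     va            = {a0, a1, a2, a3}
     offset = 64:  va' = vec_shift <va, 64>;  va = va + va'
                   -> the two halves are combined pairwise
     offset = 32:  va' = vec_shift <va, 32>;  va = va + va'
                   -> one lane now holds a0+a1+a2+a3
   and step 2.4 below extracts that lane as the scalar result.  */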
4185 if (dump_enabled_p ())
4186 dump_printf_loc (MSG_NOTE, vect_location,
4187 "Reduce using vector shifts\n");
4189 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4190 new_temp = new_phi_result;
4191 for (bit_offset = vec_size_in_bits/2;
4192 bit_offset >= element_bitsize;
4193 bit_offset /= 2)
4195 tree bitpos = size_int (bit_offset);
4197 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4198 vec_dest, new_temp, bitpos);
4199 new_name = make_ssa_name (vec_dest, epilog_stmt);
4200 gimple_assign_set_lhs (epilog_stmt, new_name);
4201 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4203 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4204 new_name, new_temp);
4205 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4206 gimple_assign_set_lhs (epilog_stmt, new_temp);
4207 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4210 extract_scalar_result = true;
4212 else
4214 tree rhs;
4216 /*** Case 3: Create:
4217 s = extract_field <v_out2, 0>
4218 for (offset = element_size;
4219 offset < vector_size;
4220 offset += element_size)
4222 Create: s' = extract_field <v_out2, offset>
4223 Create: s = op <s, s'> // For non SLP cases
4224 } */
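/* Illustration (hypothetical 4 x 32-bit vector {a0,a1,a2,a3}, non-SLP):
     s  = extract_field <v_out2, 0>                  -> a0
     s' = extract_field <v_out2, 32>;  s = s op s'   -> a0 op a1
     s' = extract_field <v_out2, 64>;  s = s op s'   -> a0 op a1 op a2
     s' = extract_field <v_out2, 96>;  s = s op s'   -> final scalar
   In the SLP case the extracted values are collected into SCALAR_RESULTS
   instead of being combined.  */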
4226 if (dump_enabled_p ())
4227 dump_printf_loc (MSG_NOTE, vect_location,
4228 "Reduce using scalar code.\n");
4230 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4231 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4233 if (gimple_code (new_phi) == GIMPLE_PHI)
4234 vec_temp = PHI_RESULT (new_phi);
4235 else
4236 vec_temp = gimple_assign_lhs (new_phi);
4237 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4238 bitsize_zero_node);
4239 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4240 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4241 gimple_assign_set_lhs (epilog_stmt, new_temp);
4242 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4244 /* In SLP we don't need to apply reduction operation, so we just
4245 collect s' values in SCALAR_RESULTS. */
4246 if (slp_reduc)
4247 scalar_results.safe_push (new_temp);
4249 for (bit_offset = element_bitsize;
4250 bit_offset < vec_size_in_bits;
4251 bit_offset += element_bitsize)
4253 tree bitpos = bitsize_int (bit_offset);
4254 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4255 bitsize, bitpos);
4257 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4258 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4259 gimple_assign_set_lhs (epilog_stmt, new_name);
4260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4262 if (slp_reduc)
4264 /* In SLP we don't need to apply reduction operation, so
4265 we just collect s' values in SCALAR_RESULTS. */
4266 new_temp = new_name;
4267 scalar_results.safe_push (new_name);
4269 else
4271 epilog_stmt = gimple_build_assign_with_ops (code,
4272 new_scalar_dest, new_name, new_temp);
4273 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4274 gimple_assign_set_lhs (epilog_stmt, new_temp);
4275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4280 /* The only case where we need to reduce scalar results in SLP is
4281 unrolling. If the size of SCALAR_RESULTS is greater than
4282 GROUP_SIZE, we reduce them by combining elements modulo
4283 GROUP_SIZE. */
4284 if (slp_reduc)
4286 tree res, first_res, new_res;
4287 gimple new_stmt;
4289 /* Reduce multiple scalar results in case of SLP unrolling. */
4290 for (j = group_size; scalar_results.iterate (j, &res);
4291 j++)
4293 first_res = scalar_results[j % group_size];
4294 new_stmt = gimple_build_assign_with_ops (code,
4295 new_scalar_dest, first_res, res);
4296 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4297 gimple_assign_set_lhs (new_stmt, new_res);
4298 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4299 scalar_results[j % group_size] = new_res;
4302 else
4303 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4304 scalar_results.safe_push (new_temp);
4306 extract_scalar_result = false;
4310 /* 2.4 Extract the final scalar result. Create:
4311 s_out3 = extract_field <v_out2, bitpos> */
4313 if (extract_scalar_result)
4315 tree rhs;
4317 if (dump_enabled_p ())
4318 dump_printf_loc (MSG_NOTE, vect_location,
4319 "extract scalar result\n");
4321 if (BYTES_BIG_ENDIAN)
4322 bitpos = size_binop (MULT_EXPR,
4323 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4324 TYPE_SIZE (scalar_type));
4325 else
4326 bitpos = bitsize_zero_node;
4328 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4329 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4330 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4331 gimple_assign_set_lhs (epilog_stmt, new_temp);
4332 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4333 scalar_results.safe_push (new_temp);
4336 vect_finalize_reduction:
4338 if (double_reduc)
4339 loop = loop->inner;
4341 /* 2.5 Adjust the final result by the initial value of the reduction
4342 variable. (When such adjustment is not needed, then
4343 'adjustment_def' is zero). For example, if code is PLUS we create:
4344 new_temp = loop_exit_def + adjustment_def */
4346 if (adjustment_def)
4348 gcc_assert (!slp_reduc);
4349 if (nested_in_vect_loop)
4351 new_phi = new_phis[0];
4352 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4353 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4354 new_dest = vect_create_destination_var (scalar_dest, vectype);
4356 else
4358 new_temp = scalar_results[0];
4359 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4360 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4361 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4364 epilog_stmt = gimple_build_assign (new_dest, expr);
4365 new_temp = make_ssa_name (new_dest, epilog_stmt);
4366 gimple_assign_set_lhs (epilog_stmt, new_temp);
4367 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4368 if (nested_in_vect_loop)
4370 set_vinfo_for_stmt (epilog_stmt,
4371 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4372 NULL));
4373 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4374 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4376 if (!double_reduc)
4377 scalar_results.quick_push (new_temp);
4378 else
4379 scalar_results[0] = new_temp;
4381 else
4382 scalar_results[0] = new_temp;
4384 new_phis[0] = epilog_stmt;
4387 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4388 phis with new adjusted scalar results, i.e., replace use <s_out0>
4389 with use <s_out4>.
4391 Transform:
4392 loop_exit:
4393 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4394 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4395 v_out2 = reduce <v_out1>
4396 s_out3 = extract_field <v_out2, 0>
4397 s_out4 = adjust_result <s_out3>
4398 use <s_out0>
4399 use <s_out0>
4401 into:
4403 loop_exit:
4404 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4405 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4406 v_out2 = reduce <v_out1>
4407 s_out3 = extract_field <v_out2, 0>
4408 s_out4 = adjust_result <s_out3>
4409 use <s_out4>
4410 use <s_out4> */
4413 /* In SLP reduction chain we reduce vector results into one vector if
4414 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4415 the last stmt in the reduction chain, since we are looking for the loop
4416 exit phi node. */
4417 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4419 scalar_dest = gimple_assign_lhs (
4420 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4421 group_size = 1;
4424 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4425 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
4426 need to match SCALAR_RESULTS with corresponding statements. The first
4427 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4428 the first vector stmt, etc.
4429 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
4430 if (group_size > new_phis.length ())
4432 ratio = group_size / new_phis.length ();
4433 gcc_assert (!(group_size % new_phis.length ()));
4435 else
4436 ratio = 1;
4438 for (k = 0; k < group_size; k++)
4440 if (k % ratio == 0)
4442 epilog_stmt = new_phis[k / ratio];
4443 reduction_phi = reduction_phis[k / ratio];
4444 if (double_reduc)
4445 inner_phi = inner_phis[k / ratio];
4448 if (slp_reduc)
4450 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4452 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4453 /* SLP statements can't participate in patterns. */
4454 gcc_assert (!orig_stmt);
4455 scalar_dest = gimple_assign_lhs (current_stmt);
4458 phis.create (3);
4459 /* Find the loop-closed-use at the loop exit of the original scalar
4460 result. (The reduction result is expected to have two immediate uses -
4461 one at the latch block, and one at the loop exit). */
4462 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4463 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4464 && !is_gimple_debug (USE_STMT (use_p)))
4465 phis.safe_push (USE_STMT (use_p));
4467 /* While we expect to have found an exit_phi because of loop-closed-ssa
4468 form we can end up without one if the scalar cycle is dead. */
4470 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4472 if (outer_loop)
4474 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4475 gimple vect_phi;
4477 /* FORNOW. Currently not supporting the case that an inner-loop
4478 reduction is not used in the outer-loop (but only outside the
4479 outer-loop), unless it is double reduction. */
4480 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4481 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4482 || double_reduc);
4484 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4485 if (!double_reduc
4486 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4487 != vect_double_reduction_def)
4488 continue;
4490 /* Handle double reduction:
4492 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4493 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4494 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4495 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4497 At that point the regular reduction (stmt2 and stmt3) is
4498 already vectorized, as well as the exit phi node, stmt4.
4499 Here we vectorize the phi node of double reduction, stmt1, and
4500 update all relevant statements. */
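/* Such a cycle typically comes from a nest like (illustrative):
     s = s0;
     for (i = 0; i < n; i++)       <-- outer loop (being vectorized)
       for (j = 0; j < m; j++)     <-- inner loop
         s = s + a[i][j];
   where stmt1/stmt4 are the outer-loop phis of s and stmt2/stmt3 are the
   inner-loop reduction phi and reduction statement.  */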
4502 /* Go through all the uses of s2 to find double reduction phi
4503 node, i.e., stmt1 above. */
4504 orig_name = PHI_RESULT (exit_phi);
4505 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4507 stmt_vec_info use_stmt_vinfo;
4508 stmt_vec_info new_phi_vinfo;
4509 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4510 basic_block bb = gimple_bb (use_stmt);
4511 gimple use;
4513 /* Check that USE_STMT is really double reduction phi
4514 node. */
4515 if (gimple_code (use_stmt) != GIMPLE_PHI
4516 || gimple_phi_num_args (use_stmt) != 2
4517 || bb->loop_father != outer_loop)
4518 continue;
4519 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4520 if (!use_stmt_vinfo
4521 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4522 != vect_double_reduction_def)
4523 continue;
4525 /* Create vector phi node for double reduction:
4526 vs1 = phi <vs0, vs2>
4527 vs1 was created previously in this function by a call to
4528 vect_get_vec_def_for_operand and is stored in
4529 vec_initial_def;
4530 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4531 vs0 is created here. */
4533 /* Create vector phi node. */
4534 vect_phi = create_phi_node (vec_initial_def, bb);
4535 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4536 loop_vec_info_for_loop (outer_loop), NULL);
4537 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4539 /* Create vs0 - initial def of the double reduction phi. */
4540 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4541 loop_preheader_edge (outer_loop));
4542 init_def = get_initial_def_for_reduction (stmt,
4543 preheader_arg, NULL);
4544 vect_phi_init = vect_init_vector (use_stmt, init_def,
4545 vectype, NULL);
4547 /* Update phi node arguments with vs0 and vs2. */
4548 add_phi_arg (vect_phi, vect_phi_init,
4549 loop_preheader_edge (outer_loop),
4550 UNKNOWN_LOCATION);
4551 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4552 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4553 if (dump_enabled_p ())
4555 dump_printf_loc (MSG_NOTE, vect_location,
4556 "created double reduction phi node: ");
4557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4558 dump_printf (MSG_NOTE, "\n");
4561 vect_phi_res = PHI_RESULT (vect_phi);
4563 /* Replace the use, i.e., set the correct vs1 in the regular
4564 reduction phi node. FORNOW, NCOPIES is always 1, so the
4565 loop is redundant. */
4566 use = reduction_phi;
4567 for (j = 0; j < ncopies; j++)
4569 edge pr_edge = loop_preheader_edge (loop);
4570 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4571 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4577 phis.release ();
4578 if (nested_in_vect_loop)
4580 if (double_reduc)
4581 loop = outer_loop;
4582 else
4583 continue;
4586 phis.create (3);
4587 /* Find the loop-closed-use at the loop exit of the original scalar
4588 result. (The reduction result is expected to have two immediate uses,
4589 one at the latch block, and one at the loop exit). For double
4590 reductions we are looking for exit phis of the outer loop. */
4591 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4593 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4595 if (!is_gimple_debug (USE_STMT (use_p)))
4596 phis.safe_push (USE_STMT (use_p));
4598 else
4600 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4602 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4604 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4606 if (!flow_bb_inside_loop_p (loop,
4607 gimple_bb (USE_STMT (phi_use_p)))
4608 && !is_gimple_debug (USE_STMT (phi_use_p)))
4609 phis.safe_push (USE_STMT (phi_use_p));
4615 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4617 /* Replace the uses: */
4618 orig_name = PHI_RESULT (exit_phi);
4619 scalar_result = scalar_results[k];
4620 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4621 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4622 SET_USE (use_p, scalar_result);
4625 phis.release ();
4630 /* Function vectorizable_reduction.
4632 Check if STMT performs a reduction operation that can be vectorized.
4633 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4634 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4635 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4637 This function also handles reduction idioms (patterns) that have been
4638 recognized in advance during vect_pattern_recog. In this case, STMT may be
4639 of this form:
4640 X = pattern_expr (arg0, arg1, ..., X)
4641 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4642 sequence that had been detected and replaced by the pattern-stmt (STMT).
4644 In some cases of reduction patterns, the type of the reduction variable X is
4645 different than the type of the other arguments of STMT.
4646 In such cases, the vectype that is used when transforming STMT into a vector
4647 stmt is different than the vectype that is used to determine the
4648 vectorization factor, because it consists of a different number of elements
4649 than the actual number of elements that are being operated upon in parallel.
4651 For example, consider an accumulation of shorts into an int accumulator.
4652 On some targets it's possible to vectorize this pattern operating on 8
4653 shorts at a time (hence, the vectype for purposes of determining the
4654 vectorization factor should be V8HI); on the other hand, the vectype that
4655 is used to create the vector form is actually V4SI (the type of the result).
4657 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4658 indicates what is the actual level of parallelism (V8HI in the example), so
4659 that the right vectorization factor would be derived. This vectype
4660 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4661 be used to create the vectorized stmt. The right vectype for the vectorized
4662 stmt is obtained from the type of the result X:
4663 get_vectype_for_scalar_type (TREE_TYPE (X))
4665 This means that, contrary to "regular" reductions (or "regular" stmts in
4666 general), the following equation:
4667 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4668 does *NOT* necessarily hold for reduction patterns. */
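/* Concretely (illustrative, assuming 128-bit vectors), for
     short a[N];  int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];
   recognized as a widening-sum pattern, STMT_VINFO_VECTYPE is V8HI (8
   shorts are consumed per vector iteration, which determines the
   vectorization factor), while the vectorized stmt itself is created with
   get_vectype_for_scalar_type (int) == V4SI.  */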
4670 bool
4671 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4672 gimple *vec_stmt, slp_tree slp_node)
4674 tree vec_dest;
4675 tree scalar_dest;
4676 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4677 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4678 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4679 tree vectype_in = NULL_TREE;
4680 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4681 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4682 enum tree_code code, orig_code, epilog_reduc_code;
4683 enum machine_mode vec_mode;
4684 int op_type;
4685 optab optab, reduc_optab;
4686 tree new_temp = NULL_TREE;
4687 tree def;
4688 gimple def_stmt;
4689 enum vect_def_type dt;
4690 gimple new_phi = NULL;
4691 tree scalar_type;
4692 bool is_simple_use;
4693 gimple orig_stmt;
4694 stmt_vec_info orig_stmt_info;
4695 tree expr = NULL_TREE;
4696 int i;
4697 int ncopies;
4698 int epilog_copies;
4699 stmt_vec_info prev_stmt_info, prev_phi_info;
4700 bool single_defuse_cycle = false;
4701 tree reduc_def = NULL_TREE;
4702 gimple new_stmt = NULL;
4703 int j;
4704 tree ops[3];
4705 bool nested_cycle = false, found_nested_cycle_def = false;
4706 gimple reduc_def_stmt = NULL;
4707 /* The default is that the reduction variable is the last operand in the statement. */
4708 int reduc_index = 2;
4709 bool double_reduc = false, dummy;
4710 basic_block def_bb;
4711 struct loop * def_stmt_loop, *outer_loop = NULL;
4712 tree def_arg;
4713 gimple def_arg_stmt;
4714 auto_vec<tree> vec_oprnds0;
4715 auto_vec<tree> vec_oprnds1;
4716 auto_vec<tree> vect_defs;
4717 auto_vec<gimple> phis;
4718 int vec_num;
4719 tree def0, def1, tem, op0, op1 = NULL_TREE;
4721 /* In case of reduction chain we switch to the first stmt in the chain, but
4722 we don't update STMT_INFO, since only the last stmt is marked as reduction
4723 and has reduction properties. */
4724 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4725 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4727 if (nested_in_vect_loop_p (loop, stmt))
4729 outer_loop = loop;
4730 loop = loop->inner;
4731 nested_cycle = true;
4734 /* 1. Is vectorizable reduction? */
4735 /* Not supportable if the reduction variable is used in the loop, unless
4736 it's a reduction chain. */
4737 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4738 && !GROUP_FIRST_ELEMENT (stmt_info))
4739 return false;
4741 /* Reductions that are not used even in an enclosing outer-loop,
4742 are expected to be "live" (used out of the loop). */
4743 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4744 && !STMT_VINFO_LIVE_P (stmt_info))
4745 return false;
4747 /* Make sure it was already recognized as a reduction computation. */
4748 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4749 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4750 return false;
4752 /* 2. Has this been recognized as a reduction pattern?
4754 Check if STMT represents a pattern that has been recognized
4755 in earlier analysis stages. For stmts that represent a pattern,
4756 the STMT_VINFO_RELATED_STMT field records the last stmt in
4757 the original sequence that constitutes the pattern. */
4759 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4760 if (orig_stmt)
4762 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4763 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4764 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4767 /* 3. Check the operands of the operation. The first operands are defined
4768 inside the loop body. The last operand is the reduction variable,
4769 which is defined by the loop-header-phi. */
4771 gcc_assert (is_gimple_assign (stmt));
4773 /* Flatten RHS. */
4774 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4776 case GIMPLE_SINGLE_RHS:
4777 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4778 if (op_type == ternary_op)
4780 tree rhs = gimple_assign_rhs1 (stmt);
4781 ops[0] = TREE_OPERAND (rhs, 0);
4782 ops[1] = TREE_OPERAND (rhs, 1);
4783 ops[2] = TREE_OPERAND (rhs, 2);
4784 code = TREE_CODE (rhs);
4786 else
4787 return false;
4788 break;
4790 case GIMPLE_BINARY_RHS:
4791 code = gimple_assign_rhs_code (stmt);
4792 op_type = TREE_CODE_LENGTH (code);
4793 gcc_assert (op_type == binary_op);
4794 ops[0] = gimple_assign_rhs1 (stmt);
4795 ops[1] = gimple_assign_rhs2 (stmt);
4796 break;
4798 case GIMPLE_TERNARY_RHS:
4799 code = gimple_assign_rhs_code (stmt);
4800 op_type = TREE_CODE_LENGTH (code);
4801 gcc_assert (op_type == ternary_op);
4802 ops[0] = gimple_assign_rhs1 (stmt);
4803 ops[1] = gimple_assign_rhs2 (stmt);
4804 ops[2] = gimple_assign_rhs3 (stmt);
4805 break;
4807 case GIMPLE_UNARY_RHS:
4808 return false;
4810 default:
4811 gcc_unreachable ();
4814 if (code == COND_EXPR && slp_node)
4815 return false;
4817 scalar_dest = gimple_assign_lhs (stmt);
4818 scalar_type = TREE_TYPE (scalar_dest);
4819 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4820 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4821 return false;
4823 /* Do not try to vectorize bit-precision reductions. */
4824 if ((TYPE_PRECISION (scalar_type)
4825 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4826 return false;
4828 /* All uses but the last are expected to be defined in the loop.
4829 The last use is the reduction variable. In case of nested cycle this
4830 assumption is not true: we use reduc_index to record the index of the
4831 reduction variable. */
4832 for (i = 0; i < op_type - 1; i++)
4834 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4835 if (i == 0 && code == COND_EXPR)
4836 continue;
4838 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4839 &def_stmt, &def, &dt, &tem);
4840 if (!vectype_in)
4841 vectype_in = tem;
4842 gcc_assert (is_simple_use);
4844 if (dt != vect_internal_def
4845 && dt != vect_external_def
4846 && dt != vect_constant_def
4847 && dt != vect_induction_def
4848 && !(dt == vect_nested_cycle && nested_cycle))
4849 return false;
4851 if (dt == vect_nested_cycle)
4853 found_nested_cycle_def = true;
4854 reduc_def_stmt = def_stmt;
4855 reduc_index = i;
4859 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4860 &def_stmt, &def, &dt, &tem);
4861 if (!vectype_in)
4862 vectype_in = tem;
4863 gcc_assert (is_simple_use);
4864 if (!(dt == vect_reduction_def
4865 || dt == vect_nested_cycle
4866 || ((dt == vect_internal_def || dt == vect_external_def
4867 || dt == vect_constant_def || dt == vect_induction_def)
4868 && nested_cycle && found_nested_cycle_def)))
4870 /* For pattern recognized stmts, orig_stmt might be a reduction,
4871 but some helper statements for the pattern might not, or
4872 might be COND_EXPRs with reduction uses in the condition. */
4873 gcc_assert (orig_stmt);
4874 return false;
4876 if (!found_nested_cycle_def)
4877 reduc_def_stmt = def_stmt;
4879 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4880 if (orig_stmt)
4881 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4882 reduc_def_stmt,
4883 !nested_cycle,
4884 &dummy));
4885 else
4887 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4888 !nested_cycle, &dummy);
4889 /* We changed STMT to be the first stmt in reduction chain, hence we
4890 check that in this case the first element in the chain is STMT. */
4891 gcc_assert (stmt == tmp
4892 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4895 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4896 return false;
4898 if (slp_node || PURE_SLP_STMT (stmt_info))
4899 ncopies = 1;
4900 else
4901 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4902 / TYPE_VECTOR_SUBPARTS (vectype_in));
4904 gcc_assert (ncopies >= 1);
4906 vec_mode = TYPE_MODE (vectype_in);
4908 if (code == COND_EXPR)
4910 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4912 if (dump_enabled_p ())
4913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4914 "unsupported condition in reduction\n");
4916 return false;
4919 else
4921 /* 4. Supportable by target? */
4923 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4924 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4926 /* Shifts and rotates are only supported by vectorizable_shifts,
4927 not vectorizable_reduction. */
4928 if (dump_enabled_p ())
4929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4930 "unsupported shift or rotation.\n");
4931 return false;
4934 /* 4.1. check support for the operation in the loop */
4935 optab = optab_for_tree_code (code, vectype_in, optab_default);
4936 if (!optab)
4938 if (dump_enabled_p ())
4939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4940 "no optab.\n");
4942 return false;
4945 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4947 if (dump_enabled_p ())
4948 dump_printf (MSG_NOTE, "op not supported by target.\n");
4950 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4951 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4952 < vect_min_worthwhile_factor (code))
4953 return false;
4955 if (dump_enabled_p ())
4956 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
4959 /* Worthwhile without SIMD support? */
4960 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4961 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4962 < vect_min_worthwhile_factor (code))
4964 if (dump_enabled_p ())
4965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4966 "not worthwhile without SIMD support.\n");
4968 return false;
4972 /* 4.2. Check support for the epilog operation.
4974 If STMT represents a reduction pattern, then the type of the
4975 reduction variable may be different than the type of the rest
4976 of the arguments. For example, consider the case of accumulation
4977 of shorts into an int accumulator. The original code:
4978 S1: int_a = (int) short_a;
4979 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4981 was replaced with:
4982 STMT: int_acc = widen_sum <short_a, int_acc>
4984 This means that:
4985 1. The tree-code that is used to create the vector operation in the
4986 epilog code (that reduces the partial results) is not the
4987 tree-code of STMT, but is rather the tree-code of the original
4988 stmt from the pattern that STMT is replacing. I.e, in the example
4989 above we want to use 'widen_sum' in the loop, but 'plus' in the
4990 epilog.
4991 2. The type (mode) we use to check available target support
4992 for the vector operation to be created in the *epilog*, is
4993 determined by the type of the reduction variable (in the example
4994 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4995 However the type (mode) we use to check available target support
4996 for the vector operation to be created *inside the loop*, is
4997 determined by the type of the other arguments to STMT (in the
4998 example we'd check this: optab_handler (widen_sum_optab,
4999 vect_short_mode)).
5001 This is contrary to "regular" reductions, in which the types of all
5002 the arguments are the same as the type of the reduction variable.
5003 For "regular" reductions we can therefore use the same vector type
5004 (and also the same tree-code) when generating the epilog code and
5005 when generating the code inside the loop. */
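/* In the shorts-into-int example above this amounts to checking the
   widen_sum optab for the short vector mode when vectorizing the loop
   body, but the plus/reduction optab for the int vector mode when
   generating the epilog (the modes named here are illustrative).  */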
5007 if (orig_stmt)
5009 /* This is a reduction pattern: get the vectype from the type of the
5010 reduction variable, and get the tree-code from orig_stmt. */
5011 orig_code = gimple_assign_rhs_code (orig_stmt);
5012 gcc_assert (vectype_out);
5013 vec_mode = TYPE_MODE (vectype_out);
5015 else
5017 /* Regular reduction: the same vectype and tree-code that are used for
5018 the vector code inside the loop can also be used for the epilog code. */
5019 orig_code = code;
5022 if (nested_cycle)
5024 def_bb = gimple_bb (reduc_def_stmt);
5025 def_stmt_loop = def_bb->loop_father;
5026 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5027 loop_preheader_edge (def_stmt_loop));
5028 if (TREE_CODE (def_arg) == SSA_NAME
5029 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5030 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5031 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5032 && vinfo_for_stmt (def_arg_stmt)
5033 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5034 == vect_double_reduction_def)
5035 double_reduc = true;
5038 epilog_reduc_code = ERROR_MARK;
5039 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5041 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5042 optab_default);
5043 if (!reduc_optab)
5045 if (dump_enabled_p ())
5046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5047 "no optab for reduction.\n");
5049 epilog_reduc_code = ERROR_MARK;
5052 if (reduc_optab
5053 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5055 if (dump_enabled_p ())
5056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5057 "reduc op not supported by target.\n");
5059 epilog_reduc_code = ERROR_MARK;
5062 else
5064 if (!nested_cycle || double_reduc)
5066 if (dump_enabled_p ())
5067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5068 "no reduc code for scalar code.\n");
5070 return false;
5074 if (double_reduc && ncopies > 1)
5076 if (dump_enabled_p ())
5077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5078 "multiple types in double reduction\n");
5080 return false;
5083 /* In case of widening multiplication by a constant, we update the type
5084 of the constant to be the type of the other operand. We check that the
5085 constant fits the type in the pattern recognition pass. */
5086 if (code == DOT_PROD_EXPR
5087 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5089 if (TREE_CODE (ops[0]) == INTEGER_CST)
5090 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5091 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5092 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5093 else
5095 if (dump_enabled_p ())
5096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097 "invalid types in dot-prod\n");
5099 return false;
5103 if (!vec_stmt) /* transformation not required. */
5105 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5106 return false;
5107 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5108 return true;
5111 /** Transform. **/
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5116 /* FORNOW: Multiple types are not supported for condition. */
5117 if (code == COND_EXPR)
5118 gcc_assert (ncopies == 1);
5120 /* Create the destination vector */
5121 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5123 /* In case the vectorization factor (VF) is bigger than the number
5124 of elements that we can fit in a vectype (nunits), we have to generate
5125 more than one vector stmt - i.e - we need to "unroll" the
5126 vector stmt by a factor VF/nunits. For more details see documentation
5127 in vectorizable_operation. */
5129 /* If the reduction is used in an outer loop we need to generate
5130 VF intermediate results, like so (e.g. for ncopies=2):
5131 r0 = phi (init, r0)
5132 r1 = phi (init, r1)
5133 r0 = x0 + r0;
5134 r1 = x1 + r1;
5135 (i.e. we generate VF results in 2 registers).
5136 In this case we have a separate def-use cycle for each copy, and therefore
5137 for each copy we get the vector def for the reduction variable from the
5138 respective phi node created for this copy.
5140 Otherwise (the reduction is unused in the loop nest), we can combine
5141 together intermediate results, like so (e.g. for ncopies=2):
5142 r = phi (init, r)
5143 r = x0 + r;
5144 r = x1 + r;
5145 (i.e. we generate VF/2 results in a single register).
5146 In this case for each copy we get the vector def for the reduction variable
5147 from the vectorized reduction operation generated in the previous iteration.
5150 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5152 single_defuse_cycle = true;
5153 epilog_copies = 1;
5155 else
5156 epilog_copies = ncopies;
5158 prev_stmt_info = NULL;
5159 prev_phi_info = NULL;
5160 if (slp_node)
5162 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5163 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5164 == TYPE_VECTOR_SUBPARTS (vectype_in));
5166 else
5168 vec_num = 1;
5169 vec_oprnds0.create (1);
5170 if (op_type == ternary_op)
5171 vec_oprnds1.create (1);
5174 phis.create (vec_num);
5175 vect_defs.create (vec_num);
5176 if (!slp_node)
5177 vect_defs.quick_push (NULL_TREE);
5179 for (j = 0; j < ncopies; j++)
5181 if (j == 0 || !single_defuse_cycle)
5183 for (i = 0; i < vec_num; i++)
5185 /* Create the reduction-phi that defines the reduction
5186 operand. */
5187 new_phi = create_phi_node (vec_dest, loop->header);
5188 set_vinfo_for_stmt (new_phi,
5189 new_stmt_vec_info (new_phi, loop_vinfo,
5190 NULL));
5191 if (j == 0 || slp_node)
5192 phis.quick_push (new_phi);
5196 if (code == COND_EXPR)
5198 gcc_assert (!slp_node);
5199 vectorizable_condition (stmt, gsi, vec_stmt,
5200 PHI_RESULT (phis[0]),
5201 reduc_index, NULL);
5202 /* Multiple types are not supported for condition. */
5203 break;
5206 /* Handle uses. */
5207 if (j == 0)
5209 op0 = ops[!reduc_index];
5210 if (op_type == ternary_op)
5212 if (reduc_index == 0)
5213 op1 = ops[2];
5214 else
5215 op1 = ops[1];
5218 if (slp_node)
5219 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5220 slp_node, -1);
5221 else
5223 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5224 stmt, NULL);
5225 vec_oprnds0.quick_push (loop_vec_def0);
5226 if (op_type == ternary_op)
5228 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5229 NULL);
5230 vec_oprnds1.quick_push (loop_vec_def1);
5234 else
5236 if (!slp_node)
5238 enum vect_def_type dt;
5239 gimple dummy_stmt;
5240 tree dummy;
5242 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5243 &dummy_stmt, &dummy, &dt);
5244 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5245 loop_vec_def0);
5246 vec_oprnds0[0] = loop_vec_def0;
5247 if (op_type == ternary_op)
5249 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5250 &dummy, &dt);
5251 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5252 loop_vec_def1);
5253 vec_oprnds1[0] = loop_vec_def1;
5257 if (single_defuse_cycle)
5258 reduc_def = gimple_assign_lhs (new_stmt);
5260 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5263 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5265 if (slp_node)
5266 reduc_def = PHI_RESULT (phis[i]);
5267 else
5269 if (!single_defuse_cycle || j == 0)
5270 reduc_def = PHI_RESULT (new_phi);
5273 def1 = ((op_type == ternary_op)
5274 ? vec_oprnds1[i] : NULL);
5275 if (op_type == binary_op)
5277 if (reduc_index == 0)
5278 expr = build2 (code, vectype_out, reduc_def, def0);
5279 else
5280 expr = build2 (code, vectype_out, def0, reduc_def);
5282 else
5284 if (reduc_index == 0)
5285 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5286 else
5288 if (reduc_index == 1)
5289 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5290 else
5291 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5295 new_stmt = gimple_build_assign (vec_dest, expr);
5296 new_temp = make_ssa_name (vec_dest, new_stmt);
5297 gimple_assign_set_lhs (new_stmt, new_temp);
5298 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5300 if (slp_node)
5302 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5303 vect_defs.quick_push (new_temp);
5305 else
5306 vect_defs[0] = new_temp;
5309 if (slp_node)
5310 continue;
5312 if (j == 0)
5313 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5314 else
5315 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5317 prev_stmt_info = vinfo_for_stmt (new_stmt);
5318 prev_phi_info = vinfo_for_stmt (new_phi);
5321 /* Finalize the reduction-phi (set its arguments) and create the
5322 epilog reduction code. */
5323 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5325 new_temp = gimple_assign_lhs (*vec_stmt);
5326 vect_defs[0] = new_temp;
5329 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5330 epilog_reduc_code, phis, reduc_index,
5331 double_reduc, slp_node);
5333 return true;
5336 /* Function vect_min_worthwhile_factor.
5338 For a loop where we could vectorize the operation indicated by CODE,
5339 return the minimum vectorization factor that makes it worthwhile
5340 to use generic vectors. */
5341 int
5342 vect_min_worthwhile_factor (enum tree_code code)
5344 switch (code)
5346 case PLUS_EXPR:
5347 case MINUS_EXPR:
5348 case NEGATE_EXPR:
5349 return 4;
5351 case BIT_AND_EXPR:
5352 case BIT_IOR_EXPR:
5353 case BIT_XOR_EXPR:
5354 case BIT_NOT_EXPR:
5355 return 2;
5357 default:
5358 return INT_MAX;
5363 /* Function vectorizable_induction
5365 Check if PHI performs an induction computation that can be vectorized.
5366 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5367 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5368 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
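/* For example (illustrative, VF = 4): an induction with initial value i0
   and step 1 is vectorized by get_initial_def_for_induction into a vector
   phi with loop-entry value {i0, i0+1, i0+2, i0+3} and a latch update that
   adds {4,4,4,4} (VF * step) on every vector iteration.  */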
5370 bool
5371 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5372 gimple *vec_stmt)
5374 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5375 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5376 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5377 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5378 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5379 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5380 tree vec_def;
5382 gcc_assert (ncopies >= 1);
5383 /* FORNOW. These restrictions should be relaxed. */
5384 if (nested_in_vect_loop_p (loop, phi))
5386 imm_use_iterator imm_iter;
5387 use_operand_p use_p;
5388 gimple exit_phi;
5389 edge latch_e;
5390 tree loop_arg;
5392 if (ncopies > 1)
5394 if (dump_enabled_p ())
5395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5396 "multiple types in nested loop.\n");
5397 return false;
5400 exit_phi = NULL;
5401 latch_e = loop_latch_edge (loop->inner);
5402 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5403 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5405 if (!flow_bb_inside_loop_p (loop->inner,
5406 gimple_bb (USE_STMT (use_p))))
5408 exit_phi = USE_STMT (use_p);
5409 break;
5412 if (exit_phi)
5414 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5415 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5416 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5418 if (dump_enabled_p ())
5419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5420 "inner-loop induction only used outside "
5421 "of the outer vectorized loop.\n");
5422 return false;
5427 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5428 return false;
5430 /* FORNOW: SLP not supported. */
5431 if (STMT_SLP_TYPE (stmt_info))
5432 return false;
5434 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5436 if (gimple_code (phi) != GIMPLE_PHI)
5437 return false;
5439 if (!vec_stmt) /* transformation not required. */
5441 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5442 if (dump_enabled_p ())
5443 dump_printf_loc (MSG_NOTE, vect_location,
5444 "=== vectorizable_induction ===\n");
5445 vect_model_induction_cost (stmt_info, ncopies);
5446 return true;
5449 /** Transform. **/
5451 if (dump_enabled_p ())
5452 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5454 vec_def = get_initial_def_for_induction (phi);
5455 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5456 return true;
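/* Illustrative sketch (not code from this file): for VF = 4 an induction
   i = {0, +, 1} is replaced by a vector phi whose initial value and step
   are built by get_initial_def_for_induction, conceptually:

       v4si vi   = { 0, 1, 2, 3 };     initial def of the vector phi
       v4si step = { 4, 4, 4, 4 };     VF scalar steps per vector copy
       loop:
         ... each lane of vi stands for one scalar value of i ...
         vi = vi + step;
*/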
5459 /* Function vectorizable_live_operation.
5461 STMT computes a value that is used outside the loop. Check if
5462 it can be supported. */
5464 bool
5465 vectorizable_live_operation (gimple stmt,
5466 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5467 gimple *vec_stmt)
5469 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5470 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5471 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5472 int i;
5473 int op_type;
5474 tree op;
5475 tree def;
5476 gimple def_stmt;
5477 enum vect_def_type dt;
5478 enum tree_code code;
5479 enum gimple_rhs_class rhs_class;
5481 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5483 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5484 return false;
5486 if (!is_gimple_assign (stmt))
5488 if (gimple_call_internal_p (stmt)
5489 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5490 && gimple_call_lhs (stmt)
5491 && loop->simduid
5492 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5493 && loop->simduid
5494 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5496 edge e = single_exit (loop);
5497 basic_block merge_bb = e->dest;
5498 imm_use_iterator imm_iter;
5499 use_operand_p use_p;
5500 tree lhs = gimple_call_lhs (stmt);
5502 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5504 gimple use_stmt = USE_STMT (use_p);
5505 if (gimple_code (use_stmt) == GIMPLE_PHI
5506 || gimple_bb (use_stmt) == merge_bb)
5508 if (vec_stmt)
5510 tree vfm1
5511 = build_int_cst (unsigned_type_node,
5512 loop_vinfo->vectorization_factor - 1);
5513 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5515 return true;
5520 return false;
5523 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5524 return false;
5526 /* FORNOW. CHECKME. */
5527 if (nested_in_vect_loop_p (loop, stmt))
5528 return false;
5530 code = gimple_assign_rhs_code (stmt);
5531 op_type = TREE_CODE_LENGTH (code);
5532 rhs_class = get_gimple_rhs_class (code);
5533 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5534 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5536 /* FORNOW: support only if all uses are invariant. This means
5537 that the scalar operations can remain in place, unvectorized.
5538 The original last scalar value that they compute will be used. */
5540 for (i = 0; i < op_type; i++)
5542 if (rhs_class == GIMPLE_SINGLE_RHS)
5543 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5544 else
5545 op = gimple_op (stmt, i + 1);
5546 if (op
5547 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5548 &dt))
5550 if (dump_enabled_p ())
5551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5552 "use not simple.\n");
5553 return false;
5556 if (dt != vect_external_def && dt != vect_constant_def)
5557 return false;
5560 /* No transformation is required for the cases we currently support. */
5561 return true;
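/* Illustrative example (not from this file) of a statement this function
   accepts: T is computed inside the loop but all its uses are outside,
   and its operands are loop-invariant, so the scalar statement can stay
   in place and the value it computes last is what the outside use sees:

       for (i = 0; i < n; i++)
         {
           a[i] = b[i] + c;        vectorized as usual
           t = c * 2;              live: used only after the loop
         }
       use (t);
*/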
5564 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5566 static void
5567 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5569 ssa_op_iter op_iter;
5570 imm_use_iterator imm_iter;
5571 def_operand_p def_p;
5572 gimple ustmt;
5574 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5576 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5578 basic_block bb;
5580 if (!is_gimple_debug (ustmt))
5581 continue;
5583 bb = gimple_bb (ustmt);
5585 if (!flow_bb_inside_loop_p (loop, bb))
5587 if (gimple_debug_bind_p (ustmt))
5589 if (dump_enabled_p ())
5590 dump_printf_loc (MSG_NOTE, vect_location,
5591 "killing debug use\n");
5593 gimple_debug_bind_reset_value (ustmt);
5594 update_stmt (ustmt);
5596 else
5597 gcc_unreachable ();
5604 /* This function builds ni_name = number of iterations. Statements
5605 are emitted on the loop preheader edge. */
5607 static tree
5608 vect_build_loop_niters (loop_vec_info loop_vinfo)
5610 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5611 if (TREE_CODE (ni) == INTEGER_CST)
5612 return ni;
5613 else
5615 tree ni_name, var;
5616 gimple_seq stmts = NULL;
5617 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5619 var = create_tmp_var (TREE_TYPE (ni), "niters");
5620 ni_name = force_gimple_operand (ni, &stmts, false, var);
5621 if (stmts)
5622 gsi_insert_seq_on_edge_immediate (pe, stmts);
5624 return ni_name;
5629 /* This function generates the following statements:
5631 ni_name = number of iterations loop executes
5632 ratio = ni_name / vf
5633 ratio_mult_vf_name = ratio * vf
5635 and places them on the loop preheader edge. */
5637 static void
5638 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5639 tree ni_name,
5640 tree *ratio_mult_vf_name_ptr,
5641 tree *ratio_name_ptr)
5643 tree ni_minus_gap_name;
5644 tree var;
5645 tree ratio_name;
5646 tree ratio_mult_vf_name;
5647 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5648 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5649 tree log_vf;
5651 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5653 /* If epilogue loop is required because of data accesses with gaps, we
5654 subtract one iteration from the total number of iterations here for
5655 correct calculation of RATIO. */
5656 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5658 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5659 ni_name,
5660 build_one_cst (TREE_TYPE (ni_name)));
5661 if (!is_gimple_val (ni_minus_gap_name))
5663 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5664 gimple stmts = NULL;
5665 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5666 true, var);
5667 gsi_insert_seq_on_edge_immediate (pe, stmts);
5670 else
5671 ni_minus_gap_name = ni_name;
5673 /* Create: ratio = ni >> log2(vf) */
5674 /* ??? As ni == number of latch executions + 1, ni could have
5675 overflowed to zero.  So avoid computing ratio directly from ni;
5676 instead, using the fact that ratio is known to be at least one,
5677 compute it as (ni - vf) >> log2(vf) + 1 (see the worked example after this function). */
5678 ratio_name
5679 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5680 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5681 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5682 ni_minus_gap_name,
5683 build_int_cst
5684 (TREE_TYPE (ni_name), vf)),
5685 log_vf),
5686 build_int_cst (TREE_TYPE (ni_name), 1));
5687 if (!is_gimple_val (ratio_name))
5689 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5690 gimple stmts = NULL;
5691 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5692 gsi_insert_seq_on_edge_immediate (pe, stmts);
5694 *ratio_name_ptr = ratio_name;
5696 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5698 if (ratio_mult_vf_name_ptr)
5700 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5701 ratio_name, log_vf);
5702 if (!is_gimple_val (ratio_mult_vf_name))
5704 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5705 gimple stmts = NULL;
5706 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5707 true, var);
5708 gsi_insert_seq_on_edge_immediate (pe, stmts);
5710 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5713 return;
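/* Worked example (illustrative only) for the expressions built above,
   assuming vf = 4 (log_vf = 2) and ni = 10:

       ratio         = ((10 - 4) >> 2) + 1 = 2
       ratio_mult_vf = 2 << 2             = 8

   so the vectorized loop runs twice, covering 8 scalar iterations, and
   the remaining 10 - 8 = 2 iterations go to the scalar epilogue.  The
   (ni - vf) >> log2(vf) + 1 form also stays correct if a 32-bit ni
   wrapped around to 0:  (0u - 4) >> 2 == 0x3fffffff, + 1 == 0x40000000,
   which is 2^32 / 4.  */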
5717 /* Function vect_transform_loop.
5719 The analysis phase has determined that the loop is vectorizable.
5720 Vectorize the loop - create vectorized stmts to replace the scalar
5721 stmts in the loop, and update the loop exit condition. */
5723 void
5724 vect_transform_loop (loop_vec_info loop_vinfo)
5726 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5727 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5728 int nbbs = loop->num_nodes;
5729 gimple_stmt_iterator si;
5730 int i;
5731 tree ratio = NULL;
5732 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5733 bool grouped_store;
5734 bool slp_scheduled = false;
5735 gimple stmt, pattern_stmt;
5736 gimple_seq pattern_def_seq = NULL;
5737 gimple_stmt_iterator pattern_def_si = gsi_none ();
5738 bool transform_pattern_stmt = false;
5739 bool check_profitability = false;
5740 int th;
5741 /* Record number of iterations before we started tampering with the profile. */
5742 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5744 if (dump_enabled_p ())
5745 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5747 /* If the profile is imprecise, we have a chance to fix it up. */
5748 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5749 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5751 /* Use the more conservative vectorization threshold.  If the number
5752 of iterations is constant, assume the cost check has been performed
5753 by our caller.  If the threshold makes all loops profitable that
5754 run at least the vectorization factor number of times, checking
5755 is pointless, too. */
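/* Worked example (illustrative numbers): with --param min-vect-loop-bound=0,
   VF = 4 and a cost-model minimum of 7 iterations, the code below computes
   th = MAX (0 * 4 - 1, 7) = 7; since 7 >= VF - 1 and the iteration count is
   not known at compile time, a runtime profitability check is requested.  */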
5756 th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5757 * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5758 th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5759 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5760 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5762 if (dump_enabled_p ())
5763 dump_printf_loc (MSG_NOTE, vect_location,
5764 "Profitability threshold is %d loop iterations.\n",
5765 th);
5766 check_profitability = true;
5769 /* Version the loop first, if required, so the profitability check
5770 comes first. */
5772 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5773 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5775 vect_loop_versioning (loop_vinfo, th, check_profitability);
5776 check_profitability = false;
5779 tree ni_name = vect_build_loop_niters (loop_vinfo);
5780 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5782 /* Peel the loop if there are data refs with unknown alignment.
5783 Only one data ref with unknown alignment can be forced to be aligned by peeling. */
5785 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5787 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5788 th, check_profitability);
5789 check_profitability = false;
5790 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5791 be re-computed. */
5792 ni_name = NULL_TREE;
5795 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5796 compile-time constant), or it is a constant that is not divisible by the
5797 vectorization factor, then an epilog loop needs to be created.
5798 We therefore duplicate the loop: the original loop will be vectorized,
5799 and will compute the first (n/VF) iterations.  The second copy of the loop
5800 will remain scalar and will compute the remaining (n%VF) iterations.
5801 (VF is the vectorization factor.) */
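/* Illustrative shape of the result (not code from this file), with VF = 4
   and a symbolic trip count n:

       for (i = 0; i < (n / 4) * 4; i += 4)
         ...vectorized body...              the first n/4 vector iterations
       for (; i < n; i++)
         ...original scalar body...         the remaining n % 4 iterations
*/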
5803 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5804 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5806 tree ratio_mult_vf;
5807 if (!ni_name)
5808 ni_name = vect_build_loop_niters (loop_vinfo);
5809 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5810 &ratio);
5811 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5812 th, check_profitability);
5814 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5815 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5816 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5817 else
5819 if (!ni_name)
5820 ni_name = vect_build_loop_niters (loop_vinfo);
5821 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5824 /* 1) Make sure the loop header has exactly two entries
5825 2) Make sure we have a preheader basic block. */
5827 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5829 split_edge (loop_preheader_edge (loop));
5831 /* FORNOW: the vectorizer supports only loops whose body consists
5832 of one basic block (header + empty latch).  When the vectorizer
5833 supports more involved loop forms, the order in which the BBs are
5834 traversed will need to be reconsidered. */
5836 for (i = 0; i < nbbs; i++)
5838 basic_block bb = bbs[i];
5839 stmt_vec_info stmt_info;
5840 gimple phi;
5842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5844 phi = gsi_stmt (si);
5845 if (dump_enabled_p ())
5847 dump_printf_loc (MSG_NOTE, vect_location,
5848 "------>vectorizing phi: ");
5849 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5850 dump_printf (MSG_NOTE, "\n");
5852 stmt_info = vinfo_for_stmt (phi);
5853 if (!stmt_info)
5854 continue;
5856 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5857 vect_loop_kill_debug_uses (loop, phi);
5859 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5860 && !STMT_VINFO_LIVE_P (stmt_info))
5861 continue;
5863 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5864 != (unsigned HOST_WIDE_INT) vectorization_factor)
5865 && dump_enabled_p ())
5866 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5868 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5870 if (dump_enabled_p ())
5871 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5872 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5876 pattern_stmt = NULL;
5877 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5879 bool is_store;
5881 if (transform_pattern_stmt)
5882 stmt = pattern_stmt;
5883 else
5885 stmt = gsi_stmt (si);
5886 /* During vectorization remove existing clobber stmts. */
5887 if (gimple_clobber_p (stmt))
5889 unlink_stmt_vdef (stmt);
5890 gsi_remove (&si, true);
5891 release_defs (stmt);
5892 continue;
5896 if (dump_enabled_p ())
5898 dump_printf_loc (MSG_NOTE, vect_location,
5899 "------>vectorizing statement: ");
5900 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5901 dump_printf (MSG_NOTE, "\n");
5904 stmt_info = vinfo_for_stmt (stmt);
5906 /* vector stmts created in the outer-loop during vectorization of
5907 stmts in an inner-loop may not have a stmt_info, and do not
5908 need to be vectorized. */
5909 if (!stmt_info)
5911 gsi_next (&si);
5912 continue;
5915 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5916 vect_loop_kill_debug_uses (loop, stmt);
5918 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5919 && !STMT_VINFO_LIVE_P (stmt_info))
5921 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5922 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5923 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5924 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5926 stmt = pattern_stmt;
5927 stmt_info = vinfo_for_stmt (stmt);
5929 else
5931 gsi_next (&si);
5932 continue;
5935 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5936 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5937 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5938 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5939 transform_pattern_stmt = true;
5941 /* If pattern statement has def stmts, vectorize them too. */
5942 if (is_pattern_stmt_p (stmt_info))
5944 if (pattern_def_seq == NULL)
5946 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5947 pattern_def_si = gsi_start (pattern_def_seq);
5949 else if (!gsi_end_p (pattern_def_si))
5950 gsi_next (&pattern_def_si);
5951 if (pattern_def_seq != NULL)
5953 gimple pattern_def_stmt = NULL;
5954 stmt_vec_info pattern_def_stmt_info = NULL;
5956 while (!gsi_end_p (pattern_def_si))
5958 pattern_def_stmt = gsi_stmt (pattern_def_si);
5959 pattern_def_stmt_info
5960 = vinfo_for_stmt (pattern_def_stmt);
5961 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5962 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5963 break;
5964 gsi_next (&pattern_def_si);
5967 if (!gsi_end_p (pattern_def_si))
5969 if (dump_enabled_p ())
5971 dump_printf_loc (MSG_NOTE, vect_location,
5972 "==> vectorizing pattern def "
5973 "stmt: ");
5974 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5975 pattern_def_stmt, 0);
5976 dump_printf (MSG_NOTE, "\n");
5979 stmt = pattern_def_stmt;
5980 stmt_info = pattern_def_stmt_info;
5982 else
5984 pattern_def_si = gsi_none ();
5985 transform_pattern_stmt = false;
5988 else
5989 transform_pattern_stmt = false;
5992 if (STMT_VINFO_VECTYPE (stmt_info))
5994 unsigned int nunits
5995 = (unsigned int)
5996 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
5997 if (!STMT_SLP_TYPE (stmt_info)
5998 && nunits != (unsigned int) vectorization_factor
5999 && dump_enabled_p ())
6000 /* For SLP, VF is set according to the unrolling factor and not
6001 to the vector size, hence for SLP this message is not valid. */
6002 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6005 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6006 reached. */
6007 if (STMT_SLP_TYPE (stmt_info))
6009 if (!slp_scheduled)
6011 slp_scheduled = true;
6013 if (dump_enabled_p ())
6014 dump_printf_loc (MSG_NOTE, vect_location,
6015 "=== scheduling SLP instances ===\n");
6017 vect_schedule_slp (loop_vinfo, NULL);
6020 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6021 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6023 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6025 pattern_def_seq = NULL;
6026 gsi_next (&si);
6028 continue;
6032 /* -------- vectorize statement ------------ */
6033 if (dump_enabled_p ())
6034 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6036 grouped_store = false;
6037 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6038 if (is_store)
6040 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6042 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6043 interleaving chain was completed - free all the stores in
6044 the chain. */
6045 gsi_next (&si);
6046 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6047 continue;
6049 else
6051 /* Free the attached stmt_vec_info and remove the stmt. */
6052 gimple store = gsi_stmt (si);
6053 free_stmt_vec_info (store);
6054 unlink_stmt_vdef (store);
6055 gsi_remove (&si, true);
6056 release_defs (store);
6057 continue;
6061 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6063 pattern_def_seq = NULL;
6064 gsi_next (&si);
6066 } /* stmts in BB */
6067 } /* BBs in loop */
6069 slpeel_make_loop_iterate_ntimes (loop, ratio);
6071 /* Reduce loop iterations by the vectorization factor. */
6072 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6073 expected_iterations / vectorization_factor);
6074 loop->nb_iterations_upper_bound
6075 = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
6076 FLOOR_DIV_EXPR);
6077 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6078 && loop->nb_iterations_upper_bound != double_int_zero)
6079 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
6080 if (loop->any_estimate)
6082 loop->nb_iterations_estimate
6083 = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
6084 FLOOR_DIV_EXPR);
6085 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6086 && loop->nb_iterations_estimate != double_int_zero)
6087 loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
6090 if (dump_enabled_p ())
6092 dump_printf_loc (MSG_NOTE, vect_location,
6093 "LOOP VECTORIZED\n");
6094 if (loop->inner)
6095 dump_printf_loc (MSG_NOTE, vect_location,
6096 "OUTER LOOP VECTORIZED\n");
6097 dump_printf (MSG_NOTE, "\n");