gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "cfgloop.h"
  45 #include "params.h"
  46 #include "tree-scalar-evolution.h"
  47 #include "tree-vectorizer.h"
  48 #include "gimple-fold.h"
  49 #include "cgraph.h"
  50
  51 /* Loop Vectorization Pass.
  52
  53    This pass tries to vectorize loops.
  54
  55    For example, the vectorizer transforms the following simple loop:
  56
  57         short a[N]; short b[N]; short c[N]; int i;
  58
  59         for (i=0; i<N; i++){
  60           a[i] = b[i] + c[i];
  61         }
  62
  63    as if it was manually vectorized by rewriting the source code into:
  64
  65         typedef int __attribute__((mode(V8HI))) v8hi;
  66         short a[N];  short b[N]; short c[N];   int i;
  67         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  68         v8hi va, vb, vc;
  69
  70         for (i=0; i<N/8; i++){
  71           vb = pb[i];
  72           vc = pc[i];
  73           va = vb + vc;
  74           pa[i] = va;
  75         }
  76
  77         The main entry to this pass is vectorize_loops(), in which
  78    the vectorizer applies a set of analyses on a given set of loops,
  79    followed by the actual vectorization transformation for the loops that
  80    had successfully passed the analysis phase.
  81         Throughout this pass we make a distinction between two types of
  82    data: scalars (which are represented by SSA_NAMES), and memory references
  83    ("data-refs").  These two types of data require different handling both
  84    during analysis and transformation. The types of data-refs that the
  85    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  86    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  87    accesses are required to have a simple (consecutive) access pattern.
  88
  89    Analysis phase:
  90    ===============
  91         The driver for the analysis phase is vect_analyze_loop().
  92    It applies a set of analyses, some of which rely on the scalar evolution
  93    analyzer (scev) developed by Sebastian Pop.
  94
  95         During the analysis phase the vectorizer records some information
  96    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  97    loop, as well as general information about the loop as a whole, which is
  98    recorded in a "loop_vec_info" struct attached to each loop.
  99
 100    Transformation phase:
 101    =====================
 102         The loop transformation phase scans all the stmts in the loop, and
 103    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 104    the loop that needs to be vectorized.  It inserts the vector code sequence
 105    just before the scalar stmt S, and records a pointer to the vector code
 106    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 107    attached to S).  This pointer will be used for the vectorization of following
 108    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 109    otherwise, we rely on dead code elimination for removing it.
 110
 111         For example, say stmt S1 was vectorized into stmt VS1:
 112
 113    VS1: vb = px[i];
 114    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 115    S2:  a = b;
 116
 117    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 118    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 119    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 120    resulting sequence would be:
 121
 122    VS1: vb = px[i];
 123    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 124    VS2: va = vb;
 125    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 126
 127         Operands that are not SSA_NAMEs, are data-refs that appear in
 128    load/store operations (like 'x[i]' in S1), and are handled differently.
 129
 130    Target modeling:
 131    =================
 132         Currently the only target specific information that is used is the
 133    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 134    Targets that can support different sizes of vectors, for now will need
 135    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 136    flexibility will be added in the future.
 137
 138         Since we only vectorize operations which vector form can be
 139    expressed using existing tree codes, to verify that an operation is
 140    supported, the vectorizer checks the relevant optab at the relevant
 141    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 142    the value found is CODE_FOR_nothing, then there's no target support, and
 143    we can't vectorize the stmt.
 144
 145    For additional information on this project see:
 146    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 147 */
 148
 149 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 150
 151 /* Function vect_determine_vectorization_factor
 152
 153    Determine the vectorization factor (VF).  VF is the number of data elements
 154    that are operated upon in parallel in a single iteration of the vectorized
 155    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 156    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 157    elements can fit in a single vector register.
 158
 159    We currently support vectorization of loops in which all types operated upon
 160    are of the same size.  Therefore this function currently sets VF according to
 161    the size of the types operated upon, and fails if there are multiple sizes
 162    in the loop.
 163
 164    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 165    original loop:
 166         for (i=0; i<N; i++){
 167           a[i] = b[i] + c[i];
 168         }
 169
 170    vectorized loop:
 171         for (i=0; i<N; i+=VF){
 172           a[i:VF] = b[i:VF] + c[i:VF];
 173         }
 174 */
 175
 176 static bool
 177 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 178 {
 179   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 180   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 181   int nbbs = loop->num_nodes;
 182   unsigned int vectorization_factor = 0;
 183   tree scalar_type;
 184   gphi *phi;
 185   tree vectype;
 186   unsigned int nunits;
 187   stmt_vec_info stmt_info;
 188   int i;
 189   HOST_WIDE_INT dummy;
 190   gimple *stmt, *pattern_stmt = NULL;
 191   gimple_seq pattern_def_seq = NULL;
 192   gimple_stmt_iterator pattern_def_si = gsi_none ();
 193   bool analyze_pattern_stmt = false;
 194
 195   if (dump_enabled_p ())
 196     dump_printf_loc (MSG_NOTE, vect_location,
 197                      "=== vect_determine_vectorization_factor ===\n");
 198
 199   for (i = 0; i < nbbs; i++)
 200     {
 201       basic_block bb = bbs[i];
 202
 203       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 204            gsi_next (&si))
 205         {
 206           phi = si.phi ();
 207           stmt_info = vinfo_for_stmt (phi);
 208           if (dump_enabled_p ())
 209             {
 210               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 211               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 212               dump_printf (MSG_NOTE, "\n");
 213             }
 214
 215           gcc_assert (stmt_info);
 216
 217           if (STMT_VINFO_RELEVANT_P (stmt_info))
 218             {
 219               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 220               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 221
 222               if (dump_enabled_p ())
 223                 {
 224                   dump_printf_loc (MSG_NOTE, vect_location,
 225                                    "get vectype for scalar type:  ");
 226                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 227                   dump_printf (MSG_NOTE, "\n");
 228                 }
 229
 230               vectype = get_vectype_for_scalar_type (scalar_type);
 231               if (!vectype)
 232                 {
 233                   if (dump_enabled_p ())
 234                     {
 235                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 236                                        "not vectorized: unsupported "
 237                                        "data-type ");
 238                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 239                                          scalar_type);
 240                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 241                     }
 242                   return false;
 243                 }
 244               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 245
 246               if (dump_enabled_p ())
 247                 {
 248                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 249                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 250                   dump_printf (MSG_NOTE, "\n");
 251                 }
 252
 253               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 254               if (dump_enabled_p ())
 255                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 256                                  nunits);
 257
 258               if (!vectorization_factor
 259                   || (nunits > vectorization_factor))
 260                 vectorization_factor = nunits;
 261             }
 262         }
 263
 264       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 265            !gsi_end_p (si) || analyze_pattern_stmt;)
 266         {
 267           tree vf_vectype;
 268
 269           if (analyze_pattern_stmt)
 270             stmt = pattern_stmt;
 271           else
 272             stmt = gsi_stmt (si);
 273
 274           stmt_info = vinfo_for_stmt (stmt);
 275
 276           if (dump_enabled_p ())
 277             {
 278               dump_printf_loc (MSG_NOTE, vect_location,
 279                                "==> examining statement: ");
 280               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 281               dump_printf (MSG_NOTE, "\n");
 282             }
 283
 284           gcc_assert (stmt_info);
 285
 286           /* Skip stmts which do not need to be vectorized.  */
 287           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 288                && !STMT_VINFO_LIVE_P (stmt_info))
 289               || gimple_clobber_p (stmt))
 290             {
 291               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 292                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 293                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 294                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 295                 {
 296                   stmt = pattern_stmt;
 297                   stmt_info = vinfo_for_stmt (pattern_stmt);
 298                   if (dump_enabled_p ())
 299                     {
 300                       dump_printf_loc (MSG_NOTE, vect_location,
 301                                        "==> examining pattern statement: ");
 302                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 303                       dump_printf (MSG_NOTE, "\n");
 304                     }
 305                 }
 306               else
 307                 {
 308                   if (dump_enabled_p ())
 309                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 310                   gsi_next (&si);
 311                   continue;
 312                 }
 313             }
 314           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 315                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 316                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 317                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 318             analyze_pattern_stmt = true;
 319
 320           /* If a pattern statement has def stmts, analyze them too.  */
 321           if (is_pattern_stmt_p (stmt_info))
 322             {
 323               if (pattern_def_seq == NULL)
 324                 {
 325                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 326                   pattern_def_si = gsi_start (pattern_def_seq);
 327                 }
 328               else if (!gsi_end_p (pattern_def_si))
 329                 gsi_next (&pattern_def_si);
 330               if (pattern_def_seq != NULL)
 331                 {
 332                   gimple *pattern_def_stmt = NULL;
 333                   stmt_vec_info pattern_def_stmt_info = NULL;
 334
 335                   while (!gsi_end_p (pattern_def_si))
 336                     {
 337                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 338                       pattern_def_stmt_info
 339                         = vinfo_for_stmt (pattern_def_stmt);
 340                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 341                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 342                         break;
 343                       gsi_next (&pattern_def_si);
 344                     }
 345
 346                   if (!gsi_end_p (pattern_def_si))
 347                     {
 348                       if (dump_enabled_p ())
 349                         {
 350                           dump_printf_loc (MSG_NOTE, vect_location,
 351                                            "==> examining pattern def stmt: ");
 352                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 353                                             pattern_def_stmt, 0);
 354                           dump_printf (MSG_NOTE, "\n");
 355                         }
 356
 357                       stmt = pattern_def_stmt;
 358                       stmt_info = pattern_def_stmt_info;
 359                     }
 360                   else
 361                     {
 362                       pattern_def_si = gsi_none ();
 363                       analyze_pattern_stmt = false;
 364                     }
 365                 }
 366               else
 367                 analyze_pattern_stmt = false;
 368             }
 369
 370           if (gimple_get_lhs (stmt) == NULL_TREE
 371               /* MASK_STORE has no lhs, but is ok.  */
 372               && (!is_gimple_call (stmt)
 373                   || !gimple_call_internal_p (stmt)
 374                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 375             {
 376               if (is_gimple_call (stmt))
 377                 {
 378                   /* Ignore calls with no lhs.  These must be calls to
 379                      #pragma omp simd functions, and what vectorization factor
 380                      it really needs can't be determined until
 381                      vectorizable_simd_clone_call.  */
 382                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 383                     {
 384                       pattern_def_seq = NULL;
 385                       gsi_next (&si);
 386                     }
 387                   continue;
 388                 }
 389               if (dump_enabled_p ())
 390                 {
 391                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 392                                    "not vectorized: irregular stmt.");
 393                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 394                                     0);
 395                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 396                 }
 397               return false;
 398             }
 399
 400           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 401             {
 402               if (dump_enabled_p ())
 403                 {
 404                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 405                                    "not vectorized: vector stmt in loop:");
 406                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 407                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 408                 }
 409               return false;
 410             }
 411
 412           if (STMT_VINFO_VECTYPE (stmt_info))
 413             {
 414               /* The only case when a vectype had been already set is for stmts
 415                  that contain a dataref, or for "pattern-stmts" (stmts
 416                  generated by the vectorizer to represent/replace a certain
 417                  idiom).  */
 418               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 419                           || is_pattern_stmt_p (stmt_info)
 420                           || !gsi_end_p (pattern_def_si));
 421               vectype = STMT_VINFO_VECTYPE (stmt_info);
 422             }
 423           else
 424             {
 425               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 426               if (is_gimple_call (stmt)
 427                   && gimple_call_internal_p (stmt)
 428                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432               if (dump_enabled_p ())
 433                 {
 434                   dump_printf_loc (MSG_NOTE, vect_location,
 435                                    "get vectype for scalar type:  ");
 436                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 437                   dump_printf (MSG_NOTE, "\n");
 438                 }
 439               vectype = get_vectype_for_scalar_type (scalar_type);
 440               if (!vectype)
 441                 {
 442                   if (dump_enabled_p ())
 443                     {
 444                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 445                                        "not vectorized: unsupported "
 446                                        "data-type ");
 447                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 448                                          scalar_type);
 449                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 450                     }
 451                   return false;
 452                 }
 453
 454               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 455
 456               if (dump_enabled_p ())
 457                 {
 458                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 459                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 460                   dump_printf (MSG_NOTE, "\n");
 461                 }
 462             }
 463
 464           /* The vectorization factor is according to the smallest
 465              scalar type (or the largest vector size, but we only
 466              support one vector size per loop).  */
 467           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 468                                                        &dummy);
 469           if (dump_enabled_p ())
 470             {
 471               dump_printf_loc (MSG_NOTE, vect_location,
 472                                "get vectype for scalar type:  ");
 473               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 474               dump_printf (MSG_NOTE, "\n");
 475             }
 476           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 477           if (!vf_vectype)
 478             {
 479               if (dump_enabled_p ())
 480                 {
 481                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 482                                    "not vectorized: unsupported data-type ");
 483                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 484                                      scalar_type);
 485                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 486                 }
 487               return false;
 488             }
 489
 490           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 491                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 492             {
 493               if (dump_enabled_p ())
 494                 {
 495                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 496                                    "not vectorized: different sized vector "
 497                                    "types in statement, ");
 498                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 499                                      vectype);
 500                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 501                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 502                                      vf_vectype);
 503                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 504                 }
 505               return false;
 506             }
 507
 508           if (dump_enabled_p ())
 509             {
 510               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 511               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 512               dump_printf (MSG_NOTE, "\n");
 513             }
 514
 515           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 516           if (dump_enabled_p ())
 517             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 518           if (!vectorization_factor
 519               || (nunits > vectorization_factor))
 520             vectorization_factor = nunits;
 521
 522           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 523             {
 524               pattern_def_seq = NULL;
 525               gsi_next (&si);
 526             }
 527         }
 528     }
 529
 530   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 531   if (dump_enabled_p ())
 532     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 533                      vectorization_factor);
 534   if (vectorization_factor <= 1)
 535     {
 536       if (dump_enabled_p ())
 537         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 538                          "not vectorized: unsupported data-type\n");
 539       return false;
 540     }
 541   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 542
 543   return true;
 544 }
 545
 546
 547 /* Function vect_is_simple_iv_evolution.
 548
 549    FORNOW: A simple evolution of an induction variables in the loop is
 550    considered a polynomial evolution.  */
 551
 552 static bool
 553 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 554                              tree * step)
 555 {
 556   tree init_expr;
 557   tree step_expr;
 558   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 559   basic_block bb;
 560
 561   /* When there is no evolution in this loop, the evolution function
 562      is not "simple".  */
 563   if (evolution_part == NULL_TREE)
 564     return false;
 565
 566   /* When the evolution is a polynomial of degree >= 2
 567      the evolution function is not "simple".  */
 568   if (tree_is_chrec (evolution_part))
 569     return false;
 570
 571   step_expr = evolution_part;
 572   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 573
 574   if (dump_enabled_p ())
 575     {
 576       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 577       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 578       dump_printf (MSG_NOTE, ",  init: ");
 579       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 580       dump_printf (MSG_NOTE, "\n");
 581     }
 582
 583   *init = init_expr;
 584   *step = step_expr;
 585
 586   if (TREE_CODE (step_expr) != INTEGER_CST
 587       && (TREE_CODE (step_expr) != SSA_NAME
 588           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 589               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 590           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 591               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 592                   || !flag_associative_math)))
 593       && (TREE_CODE (step_expr) != REAL_CST
 594           || !flag_associative_math))
 595     {
 596       if (dump_enabled_p ())
 597         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 598                          "step unknown.\n");
 599       return false;
 600     }
 601
 602   return true;
 603 }
 604
 605 /* Function vect_analyze_scalar_cycles_1.
 606
 607    Examine the cross iteration def-use cycles of scalar variables
 608    in LOOP.  LOOP_VINFO represents the loop that is now being
 609    considered for vectorization (can be LOOP, or an outer-loop
 610    enclosing LOOP).  */
 611
 612 static void
 613 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 614 {
 615   basic_block bb = loop->header;
 616   tree init, step;
 617   auto_vec<gimple *, 64> worklist;
 618   gphi_iterator gsi;
 619   bool double_reduc;
 620
 621   if (dump_enabled_p ())
 622     dump_printf_loc (MSG_NOTE, vect_location,
 623                      "=== vect_analyze_scalar_cycles ===\n");
 624
 625   /* First - identify all inductions.  Reduction detection assumes that all the
 626      inductions have been identified, therefore, this order must not be
 627      changed.  */
 628   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 629     {
 630       gphi *phi = gsi.phi ();
 631       tree access_fn = NULL;
 632       tree def = PHI_RESULT (phi);
 633       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 634
 635       if (dump_enabled_p ())
 636         {
 637           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 638           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 639           dump_printf (MSG_NOTE, "\n");
 640         }
 641
 642       /* Skip virtual phi's.  The data dependences that are associated with
 643          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 644       if (virtual_operand_p (def))
 645         continue;
 646
 647       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 648
 649       /* Analyze the evolution function.  */
 650       access_fn = analyze_scalar_evolution (loop, def);
 651       if (access_fn)
 652         {
 653           STRIP_NOPS (access_fn);
 654           if (dump_enabled_p ())
 655             {
 656               dump_printf_loc (MSG_NOTE, vect_location,
 657                                "Access function of PHI: ");
 658               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 659               dump_printf (MSG_NOTE, "\n");
 660             }
 661           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 662             = evolution_part_in_loop_num (access_fn, loop->num);
 663         }
 664
 665       if (!access_fn
 666           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 667           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 668               && TREE_CODE (step) != INTEGER_CST))
 669         {
 670           worklist.safe_push (phi);
 671           continue;
 672         }
 673
 674       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 675
 676       if (dump_enabled_p ())
 677         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 678       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 679     }
 680
 681
 682   /* Second - identify all reductions and nested cycles.  */
 683   while (worklist.length () > 0)
 684     {
 685       gimple *phi = worklist.pop ();
 686       tree def = PHI_RESULT (phi);
 687       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 688       gimple *reduc_stmt;
 689       bool nested_cycle;
 690
 691       if (dump_enabled_p ())
 692         {
 693           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 694           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 695           dump_printf (MSG_NOTE, "\n");
 696         }
 697
 698       gcc_assert (!virtual_operand_p (def)
 699                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 700
 701       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 702       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 703                                                 &double_reduc, false);
 704       if (reduc_stmt)
 705         {
 706           if (double_reduc)
 707             {
 708               if (dump_enabled_p ())
 709                 dump_printf_loc (MSG_NOTE, vect_location,
 710                                  "Detected double reduction.\n");
 711
 712               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 713               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 714                                                     vect_double_reduction_def;
 715             }
 716           else
 717             {
 718               if (nested_cycle)
 719                 {
 720                   if (dump_enabled_p ())
 721                     dump_printf_loc (MSG_NOTE, vect_location,
 722                                      "Detected vectorizable nested cycle.\n");
 723
 724                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 725                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 726                                                              vect_nested_cycle;
 727                 }
 728               else
 729                 {
 730                   if (dump_enabled_p ())
 731                     dump_printf_loc (MSG_NOTE, vect_location,
 732                                      "Detected reduction.\n");
 733
 734                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 735                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 736                                                            vect_reduction_def;
 737                   /* Store the reduction cycles for possible vectorization in
 738                      loop-aware SLP.  */
 739                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 740                 }
 741             }
 742         }
 743       else
 744         if (dump_enabled_p ())
 745           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 746                            "Unknown def-use cycle pattern.\n");
 747     }
 748 }
 749
 750
 751 /* Function vect_analyze_scalar_cycles.
 752
 753    Examine the cross iteration def-use cycles of scalar variables, by
 754    analyzing the loop-header PHIs of scalar variables.  Classify each
 755    cycle as one of the following: invariant, induction, reduction, unknown.
 756    We do that for the loop represented by LOOP_VINFO, and also to its
 757    inner-loop, if exists.
 758    Examples for scalar cycles:
 759
 760    Example1: reduction:
 761
 762               loop1:
 763               for (i=0; i<N; i++)
 764                  sum += a[i];
 765
 766    Example2: induction:
 767
 768               loop2:
 769               for (i=0; i<N; i++)
 770                  a[i] = i;  */
 771
 772 static void
 773 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 774 {
 775   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 776
 777   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 778
 779   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 780      Reductions in such inner-loop therefore have different properties than
 781      the reductions in the nest that gets vectorized:
 782      1. When vectorized, they are executed in the same order as in the original
 783         scalar loop, so we can't change the order of computation when
 784         vectorizing them.
 785      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 786         current checks are too strict.  */
 787
 788   if (loop->inner)
 789     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 790 }
 791
 792 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 793
 794 static void
 795 vect_fixup_reduc_chain (gimple *stmt)
 796 {
 797   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 798   gimple *stmtp;
 799   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 800               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 801   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 802   do
 803     {
 804       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 805       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 806       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 807       if (stmt)
 808         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 809           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 810     }
 811   while (stmt);
 812   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 813 }
 814
 815 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 816
 817 static void
 818 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 819 {
 820   gimple *first;
 821   unsigned i;
 822
 823   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 824     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 825       {
 826         vect_fixup_reduc_chain (first);
 827         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 828           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 829       }
 830 }
 831
 832 /* Function vect_get_loop_niters.
 833
 834    Determine how many iterations the loop is executed and place it
 835    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 836    in NUMBER_OF_ITERATIONSM1.
 837
 838    Return the loop exit condition.  */
 839
 840
 841 static gcond *
 842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 843                       tree *number_of_iterationsm1)
 844 {
 845   tree niters;
 846
 847   if (dump_enabled_p ())
 848     dump_printf_loc (MSG_NOTE, vect_location,
 849                      "=== get_loop_niters ===\n");
 850
 851   niters = number_of_latch_executions (loop);
 852   *number_of_iterationsm1 = niters;
 853
 854   /* We want the number of loop header executions which is the number
 855      of latch executions plus one.
 856      ???  For UINT_MAX latch executions this number overflows to zero
 857      for loops like do { n++; } while (n != 0);  */
 858   if (niters && !chrec_contains_undetermined (niters))
 859     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 860                           build_int_cst (TREE_TYPE (niters), 1));
 861   *number_of_iterations = niters;
 862
 863   return get_loop_exit_condition (loop);
 864 }
 865
 866
 867 /* Function bb_in_loop_p
 868
 869    Used as predicate for dfs order traversal of the loop bbs.  */
 870
 871 static bool
 872 bb_in_loop_p (const_basic_block bb, const void *data)
 873 {
 874   const struct loop *const loop = (const struct loop *)data;
 875   if (flow_bb_inside_loop_p (loop, bb))
 876     return true;
 877   return false;
 878 }
 879
 880
 881 /* Function new_loop_vec_info.
 882
 883    Create and initialize a new loop_vec_info struct for LOOP, as well as
 884    stmt_vec_info structs for all the stmts in LOOP.  */
 885
 886 static loop_vec_info
 887 new_loop_vec_info (struct loop *loop)
 888 {
 889   loop_vec_info res;
 890   basic_block *bbs;
 891   gimple_stmt_iterator si;
 892   unsigned int i, nbbs;
 893
 894   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 895   res->kind = vec_info::loop;
 896   LOOP_VINFO_LOOP (res) = loop;
 897
 898   bbs = get_loop_body (loop);
 899
 900   /* Create/Update stmt_info for all stmts in the loop.  */
 901   for (i = 0; i < loop->num_nodes; i++)
 902     {
 903       basic_block bb = bbs[i];
 904
 905       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 906         {
 907           gimple *phi = gsi_stmt (si);
 908           gimple_set_uid (phi, 0);
 909           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
 910         }
 911
 912       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 913         {
 914           gimple *stmt = gsi_stmt (si);
 915           gimple_set_uid (stmt, 0);
 916           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
 917         }
 918     }
 919
 920   /* CHECKME: We want to visit all BBs before their successors (except for
 921      latch blocks, for which this assertion wouldn't hold).  In the simple
 922      case of the loop forms we allow, a dfs order of the BBs would the same
 923      as reversed postorder traversal, so we are safe.  */
 924
 925    free (bbs);
 926    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 927    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 928                               bbs, loop->num_nodes, loop);
 929    gcc_assert (nbbs == loop->num_nodes);
 930
 931   LOOP_VINFO_BBS (res) = bbs;
 932   LOOP_VINFO_NITERSM1 (res) = NULL;
 933   LOOP_VINFO_NITERS (res) = NULL;
 934   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 935   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 936   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 937   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 938   LOOP_VINFO_VECT_FACTOR (res) = 0;
 939   LOOP_VINFO_LOOP_NEST (res) = vNULL;
 940   LOOP_VINFO_DATAREFS (res) = vNULL;
 941   LOOP_VINFO_DDRS (res) = vNULL;
 942   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 943   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
 944   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
 945   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
 946   LOOP_VINFO_REDUCTIONS (res) = vNULL;
 947   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
 948   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
 949   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 950   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 951   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 952   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 953   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 954
 955   return res;
 956 }
 957
 958
 959 /* Function destroy_loop_vec_info.
 960
 961    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 962    stmts in the loop.  */
 963
 964 void
 965 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 966 {
 967   struct loop *loop;
 968   basic_block *bbs;
 969   int nbbs;
 970   gimple_stmt_iterator si;
 971   int j;
 972   vec<slp_instance> slp_instances;
 973   slp_instance instance;
 974   bool swapped;
 975
 976   if (!loop_vinfo)
 977     return;
 978
 979   loop = LOOP_VINFO_LOOP (loop_vinfo);
 980
 981   bbs = LOOP_VINFO_BBS (loop_vinfo);
 982   nbbs = clean_stmts ? loop->num_nodes : 0;
 983   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 984
 985   for (j = 0; j < nbbs; j++)
 986     {
 987       basic_block bb = bbs[j];
 988       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 989         free_stmt_vec_info (gsi_stmt (si));
 990
 991       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 992         {
 993           gimple *stmt = gsi_stmt (si);
 994
 995           /* We may have broken canonical form by moving a constant
 996              into RHS1 of a commutative op.  Fix such occurrences.  */
 997           if (swapped && is_gimple_assign (stmt))
 998             {
 999               enum tree_code code = gimple_assign_rhs_code (stmt);
1000
1001               if ((code == PLUS_EXPR
1002                    || code == POINTER_PLUS_EXPR
1003                    || code == MULT_EXPR)
1004                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1005                 swap_ssa_operands (stmt,
1006                                    gimple_assign_rhs1_ptr (stmt),
1007                                    gimple_assign_rhs2_ptr (stmt));
1008             }
1009
1010           /* Free stmt_vec_info.  */
1011           free_stmt_vec_info (stmt);
1012           gsi_next (&si);
1013         }
1014     }
1015
1016   free (LOOP_VINFO_BBS (loop_vinfo));
1017   vect_destroy_datarefs (loop_vinfo);
1018   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1019   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1020   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1021   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1022   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1023   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1024     vect_free_slp_instance (instance);
1025
1026   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1027   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1028   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1029   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1030
1031   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1032   loop_vinfo->scalar_cost_vec.release ();
1033
1034   free (loop_vinfo);
1035   loop->aux = NULL;
1036 }
1037
1038
1039 /* Calculate the cost of one scalar iteration of the loop.  */
1040 static void
1041 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1042 {
1043   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1044   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1045   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1046   int innerloop_iters, i;
1047
1048   /* Count statements in scalar loop.  Using this as scalar cost for a single
1049      iteration for now.
1050
1051      TODO: Add outer loop support.
1052
1053      TODO: Consider assigning different costs to different scalar
1054      statements.  */
1055
1056   /* FORNOW.  */
1057   innerloop_iters = 1;
1058   if (loop->inner)
1059     innerloop_iters = 50; /* FIXME */
1060
1061   for (i = 0; i < nbbs; i++)
1062     {
1063       gimple_stmt_iterator si;
1064       basic_block bb = bbs[i];
1065
1066       if (bb->loop_father == loop->inner)
1067         factor = innerloop_iters;
1068       else
1069         factor = 1;
1070
1071       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1072         {
1073           gimple *stmt = gsi_stmt (si);
1074           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1075
1076           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1077             continue;
1078
1079           /* Skip stmts that are not vectorized inside the loop.  */
1080           if (stmt_info
1081               && !STMT_VINFO_RELEVANT_P (stmt_info)
1082               && (!STMT_VINFO_LIVE_P (stmt_info)
1083                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1084               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1085             continue;
1086
1087           vect_cost_for_stmt kind;
1088           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1089             {
1090               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1091                kind = scalar_load;
1092              else
1093                kind = scalar_store;
1094             }
1095           else
1096             kind = scalar_stmt;
1097
1098           scalar_single_iter_cost
1099             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1100                                  factor, kind, NULL, 0, vect_prologue);
1101         }
1102     }
1103   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1104     = scalar_single_iter_cost;
1105 }
1106
1107
1108 /* Function vect_analyze_loop_form_1.
1109
1110    Verify that certain CFG restrictions hold, including:
1111    - the loop has a pre-header
1112    - the loop has a single entry and exit
1113    - the loop exit condition is simple enough, and the number of iterations
1114      can be analyzed (a countable loop).  */
1115
1116 bool
1117 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1118                           tree *number_of_iterationsm1,
1119                           tree *number_of_iterations, gcond **inner_loop_cond)
1120 {
1121   if (dump_enabled_p ())
1122     dump_printf_loc (MSG_NOTE, vect_location,
1123                      "=== vect_analyze_loop_form ===\n");
1124
1125   /* Different restrictions apply when we are considering an inner-most loop,
1126      vs. an outer (nested) loop.
1127      (FORNOW. May want to relax some of these restrictions in the future).  */
1128
1129   if (!loop->inner)
1130     {
1131       /* Inner-most loop.  We currently require that the number of BBs is
1132          exactly 2 (the header and latch).  Vectorizable inner-most loops
1133          look like this:
1134
1135                         (pre-header)
1136                            |
1137                           header <--------+
1138                            | |            |
1139                            | +--> latch --+
1140                            |
1141                         (exit-bb)  */
1142
1143       if (loop->num_nodes != 2)
1144         {
1145           if (dump_enabled_p ())
1146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1147                              "not vectorized: control flow in loop.\n");
1148           return false;
1149         }
1150
1151       if (empty_block_p (loop->header))
1152         {
1153           if (dump_enabled_p ())
1154             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1155                              "not vectorized: empty loop.\n");
1156           return false;
1157         }
1158     }
1159   else
1160     {
1161       struct loop *innerloop = loop->inner;
1162       edge entryedge;
1163
1164       /* Nested loop. We currently require that the loop is doubly-nested,
1165          contains a single inner loop, and the number of BBs is exactly 5.
1166          Vectorizable outer-loops look like this:
1167
1168                         (pre-header)
1169                            |
1170                           header <---+
1171                            |         |
1172                           inner-loop |
1173                            |         |
1174                           tail ------+
1175                            |
1176                         (exit-bb)
1177
1178          The inner-loop has the properties expected of inner-most loops
1179          as described above.  */
1180
1181       if ((loop->inner)->inner || (loop->inner)->next)
1182         {
1183           if (dump_enabled_p ())
1184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185                              "not vectorized: multiple nested loops.\n");
1186           return false;
1187         }
1188
1189       if (loop->num_nodes != 5)
1190         {
1191           if (dump_enabled_p ())
1192             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193                              "not vectorized: control flow in loop.\n");
1194           return false;
1195         }
1196
1197       entryedge = loop_preheader_edge (innerloop);
1198       if (entryedge->src != loop->header
1199           || !single_exit (innerloop)
1200           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1201         {
1202           if (dump_enabled_p ())
1203             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1204                              "not vectorized: unsupported outerloop form.\n");
1205           return false;
1206         }
1207
1208       /* Analyze the inner-loop.  */
1209       tree inner_niterm1, inner_niter;
1210       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1211                                       &inner_niterm1, &inner_niter, NULL))
1212         {
1213           if (dump_enabled_p ())
1214             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215                              "not vectorized: Bad inner loop.\n");
1216           return false;
1217         }
1218
1219       if (!expr_invariant_in_loop_p (loop, inner_niter))
1220         {
1221           if (dump_enabled_p ())
1222             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223                              "not vectorized: inner-loop count not"
1224                              " invariant.\n");
1225           return false;
1226         }
1227
1228       if (dump_enabled_p ())
1229         dump_printf_loc (MSG_NOTE, vect_location,
1230                          "Considering outer-loop vectorization.\n");
1231     }
1232
1233   if (!single_exit (loop)
1234       || EDGE_COUNT (loop->header->preds) != 2)
1235     {
1236       if (dump_enabled_p ())
1237         {
1238           if (!single_exit (loop))
1239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1240                              "not vectorized: multiple exits.\n");
1241           else if (EDGE_COUNT (loop->header->preds) != 2)
1242             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1243                              "not vectorized: too many incoming edges.\n");
1244         }
1245       return false;
1246     }
1247
1248   /* We assume that the loop exit condition is at the end of the loop. i.e,
1249      that the loop is represented as a do-while (with a proper if-guard
1250      before the loop if needed), where the loop header contains all the
1251      executable statements, and the latch is empty.  */
1252   if (!empty_block_p (loop->latch)
1253       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1254     {
1255       if (dump_enabled_p ())
1256         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257                          "not vectorized: latch block not empty.\n");
1258       return false;
1259     }
1260
1261   /* Make sure there exists a single-predecessor exit bb:  */
1262   if (!single_pred_p (single_exit (loop)->dest))
1263     {
1264       edge e = single_exit (loop);
1265       if (!(e->flags & EDGE_ABNORMAL))
1266         {
1267           split_loop_exit_edge (e);
1268           if (dump_enabled_p ())
1269             dump_printf (MSG_NOTE, "split exit edge.\n");
1270         }
1271       else
1272         {
1273           if (dump_enabled_p ())
1274             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1275                              "not vectorized: abnormal loop exit edge.\n");
1276           return false;
1277         }
1278     }
1279
1280   *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1281                                      number_of_iterationsm1);
1282   if (!*loop_cond)
1283     {
1284       if (dump_enabled_p ())
1285         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1286                          "not vectorized: complicated exit condition.\n");
1287       return false;
1288     }
1289
1290   if (!*number_of_iterations
1291       || chrec_contains_undetermined (*number_of_iterations))
1292     {
1293       if (dump_enabled_p ())
1294         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1295                          "not vectorized: number of iterations cannot be "
1296                          "computed.\n");
1297       return false;
1298     }
1299
1300   if (integer_zerop (*number_of_iterations))
1301     {
1302       if (dump_enabled_p ())
1303         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1304                          "not vectorized: number of iterations = 0.\n");
1305       return false;
1306     }
1307
1308   return true;
1309 }
1310
1311 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1312
1313 loop_vec_info
1314 vect_analyze_loop_form (struct loop *loop)
1315 {
1316   tree number_of_iterations, number_of_iterationsm1;
1317   gcond *loop_cond, *inner_loop_cond = NULL;
1318
1319   if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1320                                   &number_of_iterations, &inner_loop_cond))
1321     return NULL;
1322
1323   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1324   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1325   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1326   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1327
1328   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1329     {
1330       if (dump_enabled_p ())
1331         {
1332           dump_printf_loc (MSG_NOTE, vect_location,
1333                            "Symbolic number of iterations is ");
1334           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1335           dump_printf (MSG_NOTE, "\n");
1336         }
1337     }
1338
1339   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1340   if (inner_loop_cond)
1341     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1342       = loop_exit_ctrl_vec_info_type;
1343
1344   gcc_assert (!loop->aux);
1345   loop->aux = loop_vinfo;
1346   return loop_vinfo;
1347 }
1348
1349
1350
1351 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1352    statements update the vectorization factor.  */
1353
1354 static void
1355 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1356 {
1357   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1358   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1359   int nbbs = loop->num_nodes;
1360   unsigned int vectorization_factor;
1361   int i;
1362
1363   if (dump_enabled_p ())
1364     dump_printf_loc (MSG_NOTE, vect_location,
1365                      "=== vect_update_vf_for_slp ===\n");
1366
1367   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1368   gcc_assert (vectorization_factor != 0);
1369
1370   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1371      vectorization factor of the loop is the unrolling factor required by
1372      the SLP instances.  If that unrolling factor is 1, we say, that we
1373      perform pure SLP on loop - cross iteration parallelism is not
1374      exploited.  */
1375   bool only_slp_in_loop = true;
1376   for (i = 0; i < nbbs; i++)
1377     {
1378       basic_block bb = bbs[i];
1379       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1380            gsi_next (&si))
1381         {
1382           gimple *stmt = gsi_stmt (si);
1383           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1384           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1385               && STMT_VINFO_RELATED_STMT (stmt_info))
1386             {
1387               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1388               stmt_info = vinfo_for_stmt (stmt);
1389             }
1390           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1391                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1392               && !PURE_SLP_STMT (stmt_info))
1393             /* STMT needs both SLP and loop-based vectorization.  */
1394             only_slp_in_loop = false;
1395         }
1396     }
1397
1398   if (only_slp_in_loop)
1399     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1400   else
1401     vectorization_factor
1402       = least_common_multiple (vectorization_factor,
1403                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1404
1405   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1406   if (dump_enabled_p ())
1407     dump_printf_loc (MSG_NOTE, vect_location,
1408                      "Updating vectorization factor to %d\n",
1409                      vectorization_factor);
1410 }
1411
1412 /* Function vect_analyze_loop_operations.
1413
1414    Scan the loop stmts and make sure they are all vectorizable.  */
1415
1416 static bool
1417 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1418 {
1419   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1420   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1421   int nbbs = loop->num_nodes;
1422   int i;
1423   stmt_vec_info stmt_info;
1424   bool need_to_vectorize = false;
1425   bool ok;
1426
1427   if (dump_enabled_p ())
1428     dump_printf_loc (MSG_NOTE, vect_location,
1429                      "=== vect_analyze_loop_operations ===\n");
1430
1431   for (i = 0; i < nbbs; i++)
1432     {
1433       basic_block bb = bbs[i];
1434
1435       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1436            gsi_next (&si))
1437         {
1438           gphi *phi = si.phi ();
1439           ok = true;
1440
1441           stmt_info = vinfo_for_stmt (phi);
1442           if (dump_enabled_p ())
1443             {
1444               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1445               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1446               dump_printf (MSG_NOTE, "\n");
1447             }
1448           if (virtual_operand_p (gimple_phi_result (phi)))
1449             continue;
1450
1451           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1452              (i.e., a phi in the tail of the outer-loop).  */
1453           if (! is_loop_header_bb_p (bb))
1454             {
1455               /* FORNOW: we currently don't support the case that these phis
1456                  are not used in the outerloop (unless it is double reduction,
1457                  i.e., this phi is vect_reduction_def), cause this case
1458                  requires to actually do something here.  */
1459               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1460                    || STMT_VINFO_LIVE_P (stmt_info))
1461                   && STMT_VINFO_DEF_TYPE (stmt_info)
1462                      != vect_double_reduction_def)
1463                 {
1464                   if (dump_enabled_p ())
1465                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466                                      "Unsupported loop-closed phi in "
1467                                      "outer-loop.\n");
1468                   return false;
1469                 }
1470
1471               /* If PHI is used in the outer loop, we check that its operand
1472                  is defined in the inner loop.  */
1473               if (STMT_VINFO_RELEVANT_P (stmt_info))
1474                 {
1475                   tree phi_op;
1476                   gimple *op_def_stmt;
1477
1478                   if (gimple_phi_num_args (phi) != 1)
1479                     return false;
1480
1481                   phi_op = PHI_ARG_DEF (phi, 0);
1482                   if (TREE_CODE (phi_op) != SSA_NAME)
1483                     return false;
1484
1485                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1486                   if (gimple_nop_p (op_def_stmt)
1487                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1488                       || !vinfo_for_stmt (op_def_stmt))
1489                     return false;
1490
1491                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1492                         != vect_used_in_outer
1493                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1494                            != vect_used_in_outer_by_reduction)
1495                     return false;
1496                 }
1497
1498               continue;
1499             }
1500
1501           gcc_assert (stmt_info);
1502
1503           if (STMT_VINFO_LIVE_P (stmt_info))
1504             {
1505               /* FORNOW: not yet supported.  */
1506               if (dump_enabled_p ())
1507                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1508                                  "not vectorized: value used after loop.\n");
1509               return false;
1510             }
1511
1512           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1513               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1514             {
1515               /* A scalar-dependence cycle that we don't support.  */
1516               if (dump_enabled_p ())
1517                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518                                  "not vectorized: scalar dependence cycle.\n");
1519               return false;
1520             }
1521
1522           if (STMT_VINFO_RELEVANT_P (stmt_info))
1523             {
1524               need_to_vectorize = true;
1525               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1526                 ok = vectorizable_induction (phi, NULL, NULL);
1527             }
1528
1529           if (!ok)
1530             {
1531               if (dump_enabled_p ())
1532                 {
1533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1534                                    "not vectorized: relevant phi not "
1535                                    "supported: ");
1536                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1537                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1538                 }
1539               return false;
1540             }
1541         }
1542
1543       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1544            gsi_next (&si))
1545         {
1546           gimple *stmt = gsi_stmt (si);
1547           if (!gimple_clobber_p (stmt)
1548               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1549             return false;
1550         }
1551     } /* bbs */
1552
1553   /* All operations in the loop are either irrelevant (deal with loop
1554      control, or dead), or only used outside the loop and can be moved
1555      out of the loop (e.g. invariants, inductions).  The loop can be
1556      optimized away by scalar optimizations.  We're better off not
1557      touching this loop.  */
1558   if (!need_to_vectorize)
1559     {
1560       if (dump_enabled_p ())
1561         dump_printf_loc (MSG_NOTE, vect_location,
1562                          "All the computation can be taken out of the loop.\n");
1563       if (dump_enabled_p ())
1564         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1565                          "not vectorized: redundant loop. no profit to "
1566                          "vectorize.\n");
1567       return false;
1568     }
1569
1570   return true;
1571 }
1572
1573
1574 /* Function vect_analyze_loop_2.
1575
1576    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1577    for it.  The different analyses will record information in the
1578    loop_vec_info struct.  */
1579 static bool
1580 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1581 {
1582   bool ok;
1583   int max_vf = MAX_VECTORIZATION_FACTOR;
1584   int min_vf = 2;
1585   unsigned int n_stmts = 0;
1586
1587   /* Find all data references in the loop (which correspond to vdefs/vuses)
1588      and analyze their evolution in the loop.  */
1589
1590   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1591
1592   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1593   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1594     {
1595       if (dump_enabled_p ())
1596         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1597                          "not vectorized: loop contains function calls"
1598                          " or data references that cannot be analyzed\n");
1599       return false;
1600     }
1601
1602   for (unsigned i = 0; i < loop->num_nodes; i++)
1603     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1604          !gsi_end_p (gsi); gsi_next (&gsi))
1605       {
1606         gimple *stmt = gsi_stmt (gsi);
1607         if (is_gimple_debug (stmt))
1608           continue;
1609         ++n_stmts;
1610         if (!find_data_references_in_stmt (loop, stmt,
1611                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1612           {
1613             if (is_gimple_call (stmt) && loop->safelen)
1614               {
1615                 tree fndecl = gimple_call_fndecl (stmt), op;
1616                 if (fndecl != NULL_TREE)
1617                   {
1618                     cgraph_node *node = cgraph_node::get (fndecl);
1619                     if (node != NULL && node->simd_clones != NULL)
1620                       {
1621                         unsigned int j, n = gimple_call_num_args (stmt);
1622                         for (j = 0; j < n; j++)
1623                           {
1624                             op = gimple_call_arg (stmt, j);
1625                             if (DECL_P (op)
1626                                 || (REFERENCE_CLASS_P (op)
1627                                     && get_base_address (op)))
1628                               break;
1629                           }
1630                         op = gimple_call_lhs (stmt);
1631                         /* Ignore #pragma omp declare simd functions
1632                            if they don't have data references in the
1633                            call stmt itself.  */
1634                         if (j == n
1635                             && !(op
1636                                  && (DECL_P (op)
1637                                      || (REFERENCE_CLASS_P (op)
1638                                          && get_base_address (op)))))
1639                           continue;
1640                       }
1641                   }
1642               }
1643             if (dump_enabled_p ())
1644               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645                                "not vectorized: loop contains function "
1646                                "calls or data references that cannot "
1647                                "be analyzed\n");
1648             return false;
1649           }
1650       }
1651
1652   /* Analyze the data references and also adjust the minimal
1653      vectorization factor according to the loads and stores.  */
1654
1655   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1656   if (!ok)
1657     {
1658       if (dump_enabled_p ())
1659         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660                          "bad data references.\n");
1661       return false;
1662     }
1663
1664   /* Classify all cross-iteration scalar data-flow cycles.
1665      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1666
1667   vect_analyze_scalar_cycles (loop_vinfo);
1668
1669   vect_pattern_recog (loop_vinfo);
1670
1671   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1672
1673   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1674      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1675
1676   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1677   if (!ok)
1678     {
1679       if (dump_enabled_p ())
1680         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1681                          "bad data access.\n");
1682       return false;
1683     }
1684
1685   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1686
1687   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1688   if (!ok)
1689     {
1690       if (dump_enabled_p ())
1691         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692                          "unexpected pattern.\n");
1693       return false;
1694     }
1695
1696   /* Analyze data dependences between the data-refs in the loop
1697      and adjust the maximum vectorization factor according to
1698      the dependences.
1699      FORNOW: fail at the first data dependence that we encounter.  */
1700
1701   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1702   if (!ok
1703       || max_vf < min_vf)
1704     {
1705       if (dump_enabled_p ())
1706             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707                              "bad data dependence.\n");
1708       return false;
1709     }
1710
1711   ok = vect_determine_vectorization_factor (loop_vinfo);
1712   if (!ok)
1713     {
1714       if (dump_enabled_p ())
1715         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1716                          "can't determine vectorization factor.\n");
1717       return false;
1718     }
1719   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1720     {
1721       if (dump_enabled_p ())
1722         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1723                          "bad data dependence.\n");
1724       return false;
1725     }
1726
1727   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1728   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1729   if (!ok)
1730     return false;
1731
1732   /* If there are any SLP instances mark them as pure_slp.  */
1733   bool slp = vect_make_slp_decision (loop_vinfo);
1734   if (slp)
1735     {
1736       /* Find stmts that need to be both vectorized and SLPed.  */
1737       vect_detect_hybrid_slp (loop_vinfo);
1738
1739       /* Update the vectorization factor based on the SLP decision.  */
1740       vect_update_vf_for_slp (loop_vinfo);
1741     }
1742
1743   /* Now the vectorization factor is final.  */
1744   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1745   gcc_assert (vectorization_factor != 0);
1746
1747   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1748     dump_printf_loc (MSG_NOTE, vect_location,
1749                      "vectorization_factor = %d, niters = "
1750                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1751                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1752
1753   HOST_WIDE_INT max_niter
1754     = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1755   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1756        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1757       || (max_niter != -1
1758           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1759     {
1760       if (dump_enabled_p ())
1761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762                          "not vectorized: iteration count too small.\n");
1763       if (dump_enabled_p ())
1764         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765                          "not vectorized: iteration count smaller than "
1766                          "vectorization factor.\n");
1767       return false;
1768     }
1769
1770   /* Analyze the alignment of the data-refs in the loop.
1771      Fail if a data reference is found that cannot be vectorized.  */
1772
1773   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1774   if (!ok)
1775     {
1776       if (dump_enabled_p ())
1777         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1778                          "bad data alignment.\n");
1779       return false;
1780     }
1781
1782   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1783      It is important to call pruning after vect_analyze_data_ref_accesses,
1784      since we use grouping information gathered by interleaving analysis.  */
1785   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1786   if (!ok)
1787     {
1788       if (dump_enabled_p ())
1789         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1790                          "number of versioning for alias "
1791                          "run-time tests exceeds %d "
1792                          "(--param vect-max-version-for-alias-checks)\n",
1793                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1794       return false;
1795     }
1796
1797   /* Compute the scalar iteration cost.  */
1798   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1799
1800   /* This pass will decide on using loop versioning and/or loop peeling in
1801      order to enhance the alignment of data references in the loop.  */
1802
1803   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1804   if (!ok)
1805     {
1806       if (dump_enabled_p ())
1807         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808                          "bad data alignment.\n");
1809       return false;
1810     }
1811
1812   if (slp)
1813     {
1814       /* Analyze operations in the SLP instances.  Note this may
1815          remove unsupported SLP instances which makes the above
1816          SLP kind detection invalid.  */
1817       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1818       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1819                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1820       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1821         return false;
1822     }
1823
1824   /* Scan all the remaining operations in the loop that are not subject
1825      to SLP and make sure they are vectorizable.  */
1826   ok = vect_analyze_loop_operations (loop_vinfo);
1827   if (!ok)
1828     {
1829       if (dump_enabled_p ())
1830         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1831                          "bad operation or unsupported loop bound.\n");
1832       return false;
1833     }
1834
1835   /* Analyze cost.  Decide if worth while to vectorize.  */
1836   int min_profitable_estimate, min_profitable_iters;
1837   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1838                                       &min_profitable_estimate);
1839
1840   if (min_profitable_iters < 0)
1841     {
1842       if (dump_enabled_p ())
1843         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844                          "not vectorized: vectorization not profitable.\n");
1845       if (dump_enabled_p ())
1846         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1847                          "not vectorized: vector version will never be "
1848                          "profitable.\n");
1849       return false;
1850     }
1851
1852   int min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1853                                 * vectorization_factor) - 1);
1854
1855   /* Use the cost model only if it is more conservative than user specified
1856      threshold.  */
1857   unsigned th = (unsigned) min_scalar_loop_bound;
1858   if (min_profitable_iters
1859       && (!min_scalar_loop_bound
1860           || min_profitable_iters > min_scalar_loop_bound))
1861     th = (unsigned) min_profitable_iters;
1862
1863   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1864
1865   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1866       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1867     {
1868       if (dump_enabled_p ())
1869         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1870                          "not vectorized: vectorization not profitable.\n");
1871       if (dump_enabled_p ())
1872         dump_printf_loc (MSG_NOTE, vect_location,
1873                          "not vectorized: iteration count smaller than user "
1874                          "specified loop bound parameter or minimum profitable "
1875                          "iterations (whichever is more conservative).\n");
1876       return false;
1877     }
1878
1879   HOST_WIDE_INT estimated_niter
1880     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1881   if (estimated_niter != -1
1882       && ((unsigned HOST_WIDE_INT) estimated_niter
1883           <= MAX (th, (unsigned)min_profitable_estimate)))
1884     {
1885       if (dump_enabled_p ())
1886         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1887                          "not vectorized: estimated iteration count too "
1888                          "small.\n");
1889       if (dump_enabled_p ())
1890         dump_printf_loc (MSG_NOTE, vect_location,
1891                          "not vectorized: estimated iteration count smaller "
1892                          "than specified loop bound parameter or minimum "
1893                          "profitable iterations (whichever is more "
1894                          "conservative).\n");
1895       return false;
1896     }
1897
1898   /* Decide whether we need to create an epilogue loop to handle
1899      remaining scalar iterations.  */
1900   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1901         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1902        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1903
1904   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1905       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1906     {
1907       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1908                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1909           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1910         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1911     }
1912   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1913            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1914                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1915                /* In case of versioning, check if the maximum number of
1916                   iterations is greater than th.  If they are identical,
1917                   the epilogue is unnecessary.  */
1918                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1919                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1920                    || (unsigned HOST_WIDE_INT) max_niter > th)))
1921     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1922
1923   /* If an epilogue loop is required make sure we can create one.  */
1924   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1925       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1926     {
1927       if (dump_enabled_p ())
1928         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1929       if (!vect_can_advance_ivs_p (loop_vinfo)
1930           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1931                                            single_exit (LOOP_VINFO_LOOP
1932                                                          (loop_vinfo))))
1933         {
1934           if (dump_enabled_p ())
1935             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936                              "not vectorized: can't create required "
1937                              "epilog loop\n");
1938           return false;
1939         }
1940     }
1941
1942   gcc_assert (vectorization_factor
1943               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1944
1945   return true;
1946 }
1947
1948 /* Function vect_analyze_loop.
1949
1950    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1951    for it.  The different analyses will record information in the
1952    loop_vec_info struct.  */
1953 loop_vec_info
1954 vect_analyze_loop (struct loop *loop)
1955 {
1956   loop_vec_info loop_vinfo;
1957   unsigned int vector_sizes;
1958
1959   /* Autodetect first vector size we try.  */
1960   current_vector_size = 0;
1961   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1962
1963   if (dump_enabled_p ())
1964     dump_printf_loc (MSG_NOTE, vect_location,
1965                      "===== analyze_loop_nest =====\n");
1966
1967   if (loop_outer (loop)
1968       && loop_vec_info_for_loop (loop_outer (loop))
1969       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1970     {
1971       if (dump_enabled_p ())
1972         dump_printf_loc (MSG_NOTE, vect_location,
1973                          "outer-loop already vectorized.\n");
1974       return NULL;
1975     }
1976
1977   while (1)
1978     {
1979       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1980       loop_vinfo = vect_analyze_loop_form (loop);
1981       if (!loop_vinfo)
1982         {
1983           if (dump_enabled_p ())
1984             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985                              "bad loop form.\n");
1986           return NULL;
1987         }
1988
1989       if (vect_analyze_loop_2 (loop_vinfo))
1990         {
1991           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1992
1993           return loop_vinfo;
1994         }
1995
1996       destroy_loop_vec_info (loop_vinfo, true);
1997
1998       vector_sizes &= ~current_vector_size;
1999       if (vector_sizes == 0
2000           || current_vector_size == 0)
2001         return NULL;
2002
2003       /* Try the next biggest vector size.  */
2004       current_vector_size = 1 << floor_log2 (vector_sizes);
2005       if (dump_enabled_p ())
2006         dump_printf_loc (MSG_NOTE, vect_location,
2007                          "***** Re-trying analysis with "
2008                          "vector size %d\n", current_vector_size);
2009     }
2010 }
2011
2012
2013 /* Function reduction_code_for_scalar_code
2014
2015    Input:
2016    CODE - tree_code of a reduction operations.
2017
2018    Output:
2019    REDUC_CODE - the corresponding tree-code to be used to reduce the
2020       vector of partial results into a single scalar result, or ERROR_MARK
2021       if the operation is a supported reduction operation, but does not have
2022       such a tree-code.
2023
2024    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2025
2026 static bool
2027 reduction_code_for_scalar_code (enum tree_code code,
2028                                 enum tree_code *reduc_code)
2029 {
2030   switch (code)
2031     {
2032       case MAX_EXPR:
2033         *reduc_code = REDUC_MAX_EXPR;
2034         return true;
2035
2036       case MIN_EXPR:
2037         *reduc_code = REDUC_MIN_EXPR;
2038         return true;
2039
2040       case PLUS_EXPR:
2041         *reduc_code = REDUC_PLUS_EXPR;
2042         return true;
2043
2044       case MULT_EXPR:
2045       case MINUS_EXPR:
2046       case BIT_IOR_EXPR:
2047       case BIT_XOR_EXPR:
2048       case BIT_AND_EXPR:
2049         *reduc_code = ERROR_MARK;
2050         return true;
2051
2052       default:
2053        return false;
2054     }
2055 }
2056
2057
2058 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2059    STMT is printed with a message MSG. */
2060
2061 static void
2062 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2063 {
2064   dump_printf_loc (msg_type, vect_location, "%s", msg);
2065   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2066   dump_printf (msg_type, "\n");
2067 }
2068
2069
2070 /* Detect SLP reduction of the form:
2071
2072    #a1 = phi <a5, a0>
2073    a2 = operation (a1)
2074    a3 = operation (a2)
2075    a4 = operation (a3)
2076    a5 = operation (a4)
2077
2078    #a = phi <a5>
2079
2080    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2081    FIRST_STMT is the first reduction stmt in the chain
2082    (a2 = operation (a1)).
2083
2084    Return TRUE if a reduction chain was detected.  */
2085
2086 static bool
2087 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2088                        gimple *first_stmt)
2089 {
2090   struct loop *loop = (gimple_bb (phi))->loop_father;
2091   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2092   enum tree_code code;
2093   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2094   stmt_vec_info use_stmt_info, current_stmt_info;
2095   tree lhs;
2096   imm_use_iterator imm_iter;
2097   use_operand_p use_p;
2098   int nloop_uses, size = 0, n_out_of_loop_uses;
2099   bool found = false;
2100
2101   if (loop != vect_loop)
2102     return false;
2103
2104   lhs = PHI_RESULT (phi);
2105   code = gimple_assign_rhs_code (first_stmt);
2106   while (1)
2107     {
2108       nloop_uses = 0;
2109       n_out_of_loop_uses = 0;
2110       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2111         {
2112           gimple *use_stmt = USE_STMT (use_p);
2113           if (is_gimple_debug (use_stmt))
2114             continue;
2115
2116           /* Check if we got back to the reduction phi.  */
2117           if (use_stmt == phi)
2118             {
2119               loop_use_stmt = use_stmt;
2120               found = true;
2121               break;
2122             }
2123
2124           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2125             {
2126               loop_use_stmt = use_stmt;
2127               nloop_uses++;
2128             }
2129            else
2130              n_out_of_loop_uses++;
2131
2132            /* There are can be either a single use in the loop or two uses in
2133               phi nodes.  */
2134            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2135              return false;
2136         }
2137
2138       if (found)
2139         break;
2140
2141       /* We reached a statement with no loop uses.  */
2142       if (nloop_uses == 0)
2143         return false;
2144
2145       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2146       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2147         return false;
2148
2149       if (!is_gimple_assign (loop_use_stmt)
2150           || code != gimple_assign_rhs_code (loop_use_stmt)
2151           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2152         return false;
2153
2154       /* Insert USE_STMT into reduction chain.  */
2155       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2156       if (current_stmt)
2157         {
2158           current_stmt_info = vinfo_for_stmt (current_stmt);
2159           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2160           GROUP_FIRST_ELEMENT (use_stmt_info)
2161             = GROUP_FIRST_ELEMENT (current_stmt_info);
2162         }
2163       else
2164         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2165
2166       lhs = gimple_assign_lhs (loop_use_stmt);
2167       current_stmt = loop_use_stmt;
2168       size++;
2169    }
2170
2171   if (!found || loop_use_stmt != phi || size < 2)
2172     return false;
2173
2174   /* Swap the operands, if needed, to make the reduction operand be the second
2175      operand.  */
2176   lhs = PHI_RESULT (phi);
2177   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2178   while (next_stmt)
2179     {
2180       if (gimple_assign_rhs2 (next_stmt) == lhs)
2181         {
2182           tree op = gimple_assign_rhs1 (next_stmt);
2183           gimple *def_stmt = NULL;
2184
2185           if (TREE_CODE (op) == SSA_NAME)
2186             def_stmt = SSA_NAME_DEF_STMT (op);
2187
2188           /* Check that the other def is either defined in the loop
2189              ("vect_internal_def"), or it's an induction (defined by a
2190              loop-header phi-node).  */
2191           if (def_stmt
2192               && gimple_bb (def_stmt)
2193               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2194               && (is_gimple_assign (def_stmt)
2195                   || is_gimple_call (def_stmt)
2196                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2197                            == vect_induction_def
2198                   || (gimple_code (def_stmt) == GIMPLE_PHI
2199                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2200                                   == vect_internal_def
2201                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2202             {
2203               lhs = gimple_assign_lhs (next_stmt);
2204               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2205               continue;
2206             }
2207
2208           return false;
2209         }
2210       else
2211         {
2212           tree op = gimple_assign_rhs2 (next_stmt);
2213           gimple *def_stmt = NULL;
2214
2215           if (TREE_CODE (op) == SSA_NAME)
2216             def_stmt = SSA_NAME_DEF_STMT (op);
2217
2218           /* Check that the other def is either defined in the loop
2219             ("vect_internal_def"), or it's an induction (defined by a
2220             loop-header phi-node).  */
2221           if (def_stmt
2222               && gimple_bb (def_stmt)
2223               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2224               && (is_gimple_assign (def_stmt)
2225                   || is_gimple_call (def_stmt)
2226                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2227                               == vect_induction_def
2228                   || (gimple_code (def_stmt) == GIMPLE_PHI
2229                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2230                                   == vect_internal_def
2231                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2232             {
2233               if (dump_enabled_p ())
2234                 {
2235                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2236                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2237                   dump_printf (MSG_NOTE, "\n");
2238                 }
2239
2240               swap_ssa_operands (next_stmt,
2241                                  gimple_assign_rhs1_ptr (next_stmt),
2242                                  gimple_assign_rhs2_ptr (next_stmt));
2243               update_stmt (next_stmt);
2244
2245               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2246                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2247             }
2248           else
2249             return false;
2250         }
2251
2252       lhs = gimple_assign_lhs (next_stmt);
2253       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2254     }
2255
2256   /* Save the chain for further analysis in SLP detection.  */
2257   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2258   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2259   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2260
2261   return true;
2262 }
2263
2264
2265 /* Function vect_is_simple_reduction_1
2266
2267    (1) Detect a cross-iteration def-use cycle that represents a simple
2268    reduction computation.  We look for the following pattern:
2269
2270    loop_header:
2271      a1 = phi < a0, a2 >
2272      a3 = ...
2273      a2 = operation (a3, a1)
2274
2275    or
2276
2277    a3 = ...
2278    loop_header:
2279      a1 = phi < a0, a2 >
2280      a2 = operation (a3, a1)
2281
2282    such that:
2283    1. operation is commutative and associative and it is safe to
2284       change the order of the computation (if CHECK_REDUCTION is true)
2285    2. no uses for a2 in the loop (a2 is used out of the loop)
2286    3. no uses of a1 in the loop besides the reduction operation
2287    4. no uses of a1 outside the loop.
2288
2289    Conditions 1,4 are tested here.
2290    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2291
2292    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2293    nested cycles, if CHECK_REDUCTION is false.
2294
2295    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2296    reductions:
2297
2298      a1 = phi < a0, a2 >
2299      inner loop (def of a3)
2300      a2 = phi < a3 >
2301
2302    (4) Detect condition expressions, ie:
2303      for (int i = 0; i < N; i++)
2304        if (a[i] < val)
2305         ret_val = a[i];
2306
2307    If MODIFY is true it tries also to rework the code in-place to enable
2308    detection of more reduction patterns.  For the time being we rewrite
2309    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2310 */
2311
2312 static gimple *
2313 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple *phi,
2314                             bool check_reduction, bool *double_reduc,
2315                             bool modify, bool need_wrapping_integral_overflow,
2316                             enum vect_reduction_type *v_reduc_type)
2317 {
2318   struct loop *loop = (gimple_bb (phi))->loop_father;
2319   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2320   edge latch_e = loop_latch_edge (loop);
2321   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2322   gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2323   enum tree_code orig_code, code;
2324   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2325   tree type;
2326   int nloop_uses;
2327   tree name;
2328   imm_use_iterator imm_iter;
2329   use_operand_p use_p;
2330   bool phi_def;
2331
2332   *double_reduc = false;
2333   *v_reduc_type = TREE_CODE_REDUCTION;
2334
2335   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2336      otherwise, we assume outer loop vectorization.  */
2337   gcc_assert ((check_reduction && loop == vect_loop)
2338               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2339
2340   name = PHI_RESULT (phi);
2341   /* ???  If there are no uses of the PHI result the inner loop reduction
2342      won't be detected as possibly double-reduction by vectorizable_reduction
2343      because that tries to walk the PHI arg from the preheader edge which
2344      can be constant.  See PR60382.  */
2345   if (has_zero_uses (name))
2346     return NULL;
2347   nloop_uses = 0;
2348   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2349     {
2350       gimple *use_stmt = USE_STMT (use_p);
2351       if (is_gimple_debug (use_stmt))
2352         continue;
2353
2354       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2355         {
2356           if (dump_enabled_p ())
2357             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2358                              "intermediate value used outside loop.\n");
2359
2360           return NULL;
2361         }
2362
2363       nloop_uses++;
2364       if (nloop_uses > 1)
2365         {
2366           if (dump_enabled_p ())
2367             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368                              "reduction used in loop.\n");
2369           return NULL;
2370         }
2371     }
2372
2373   if (TREE_CODE (loop_arg) != SSA_NAME)
2374     {
2375       if (dump_enabled_p ())
2376         {
2377           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2378                            "reduction: not ssa_name: ");
2379           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2380           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2381         }
2382       return NULL;
2383     }
2384
2385   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2386   if (!def_stmt)
2387     {
2388       if (dump_enabled_p ())
2389         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2390                          "reduction: no def_stmt.\n");
2391       return NULL;
2392     }
2393
2394   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2395     {
2396       if (dump_enabled_p ())
2397         {
2398           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2399           dump_printf (MSG_NOTE, "\n");
2400         }
2401       return NULL;
2402     }
2403
2404   if (is_gimple_assign (def_stmt))
2405     {
2406       name = gimple_assign_lhs (def_stmt);
2407       phi_def = false;
2408     }
2409   else
2410     {
2411       name = PHI_RESULT (def_stmt);
2412       phi_def = true;
2413     }
2414
2415   nloop_uses = 0;
2416   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2417     {
2418       gimple *use_stmt = USE_STMT (use_p);
2419       if (is_gimple_debug (use_stmt))
2420         continue;
2421       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2422         nloop_uses++;
2423       if (nloop_uses > 1)
2424         {
2425           if (dump_enabled_p ())
2426             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427                              "reduction used in loop.\n");
2428           return NULL;
2429         }
2430     }
2431
2432   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2433      defined in the inner loop.  */
2434   if (phi_def)
2435     {
2436       op1 = PHI_ARG_DEF (def_stmt, 0);
2437
2438       if (gimple_phi_num_args (def_stmt) != 1
2439           || TREE_CODE (op1) != SSA_NAME)
2440         {
2441           if (dump_enabled_p ())
2442             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2443                              "unsupported phi node definition.\n");
2444
2445           return NULL;
2446         }
2447
2448       def1 = SSA_NAME_DEF_STMT (op1);
2449       if (gimple_bb (def1)
2450           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2451           && loop->inner
2452           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2453           && is_gimple_assign (def1))
2454         {
2455           if (dump_enabled_p ())
2456             report_vect_op (MSG_NOTE, def_stmt,
2457                             "detected double reduction: ");
2458
2459           *double_reduc = true;
2460           return def_stmt;
2461         }
2462
2463       return NULL;
2464     }
2465
2466   code = orig_code = gimple_assign_rhs_code (def_stmt);
2467
2468   /* We can handle "res -= x[i]", which is non-associative by
2469      simply rewriting this into "res += -x[i]".  Avoid changing
2470      gimple instruction for the first simple tests and only do this
2471      if we're allowed to change code at all.  */
2472   if (code == MINUS_EXPR
2473       && modify
2474       && (op1 = gimple_assign_rhs1 (def_stmt))
2475       && TREE_CODE (op1) == SSA_NAME
2476       && SSA_NAME_DEF_STMT (op1) == phi)
2477     code = PLUS_EXPR;
2478
2479   if (check_reduction)
2480     {
2481       if (code == COND_EXPR)
2482         *v_reduc_type = COND_REDUCTION;
2483       else if (!commutative_tree_code (code) || !associative_tree_code (code))
2484         {
2485           if (dump_enabled_p ())
2486             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2487                             "reduction: not commutative/associative: ");
2488           return NULL;
2489         }
2490     }
2491
2492   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2493     {
2494       if (code != COND_EXPR)
2495         {
2496           if (dump_enabled_p ())
2497             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2498                             "reduction: not binary operation: ");
2499
2500           return NULL;
2501         }
2502
2503       op3 = gimple_assign_rhs1 (def_stmt);
2504       if (COMPARISON_CLASS_P (op3))
2505         {
2506           op4 = TREE_OPERAND (op3, 1);
2507           op3 = TREE_OPERAND (op3, 0);
2508         }
2509
2510       op1 = gimple_assign_rhs2 (def_stmt);
2511       op2 = gimple_assign_rhs3 (def_stmt);
2512
2513       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2514         {
2515           if (dump_enabled_p ())
2516             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2517                             "reduction: uses not ssa_names: ");
2518
2519           return NULL;
2520         }
2521     }
2522   else
2523     {
2524       op1 = gimple_assign_rhs1 (def_stmt);
2525       op2 = gimple_assign_rhs2 (def_stmt);
2526
2527       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2528         {
2529           if (dump_enabled_p ())
2530             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2531                             "reduction: uses not ssa_names: ");
2532
2533           return NULL;
2534         }
2535    }
2536
2537   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2538   if ((TREE_CODE (op1) == SSA_NAME
2539        && !types_compatible_p (type,TREE_TYPE (op1)))
2540       || (TREE_CODE (op2) == SSA_NAME
2541           && !types_compatible_p (type, TREE_TYPE (op2)))
2542       || (op3 && TREE_CODE (op3) == SSA_NAME
2543           && !types_compatible_p (type, TREE_TYPE (op3)))
2544       || (op4 && TREE_CODE (op4) == SSA_NAME
2545           && !types_compatible_p (type, TREE_TYPE (op4))))
2546     {
2547       if (dump_enabled_p ())
2548         {
2549           dump_printf_loc (MSG_NOTE, vect_location,
2550                            "reduction: multiple types: operation type: ");
2551           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2552           dump_printf (MSG_NOTE, ", operands types: ");
2553           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2554                              TREE_TYPE (op1));
2555           dump_printf (MSG_NOTE, ",");
2556           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2557                              TREE_TYPE (op2));
2558           if (op3)
2559             {
2560               dump_printf (MSG_NOTE, ",");
2561               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2562                                  TREE_TYPE (op3));
2563             }
2564
2565           if (op4)
2566             {
2567               dump_printf (MSG_NOTE, ",");
2568               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2569                                  TREE_TYPE (op4));
2570             }
2571           dump_printf (MSG_NOTE, "\n");
2572         }
2573
2574       return NULL;
2575     }
2576
2577   /* Check that it's ok to change the order of the computation.
2578      Generally, when vectorizing a reduction we change the order of the
2579      computation.  This may change the behavior of the program in some
2580      cases, so we need to check that this is ok.  One exception is when
2581      vectorizing an outer-loop: the inner-loop is executed sequentially,
2582      and therefore vectorizing reductions in the inner-loop during
2583      outer-loop vectorization is safe.  */
2584
2585   if (*v_reduc_type != COND_REDUCTION)
2586     {
2587       /* CHECKME: check for !flag_finite_math_only too?  */
2588       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2589           && check_reduction)
2590         {
2591           /* Changing the order of operations changes the semantics.  */
2592           if (dump_enabled_p ())
2593             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2594                         "reduction: unsafe fp math optimization: ");
2595           return NULL;
2596         }
2597       else if (INTEGRAL_TYPE_P (type) && check_reduction)
2598         {
2599           if (!operation_no_trapping_overflow (type, code))
2600             {
2601               /* Changing the order of operations changes the semantics.  */
2602               if (dump_enabled_p ())
2603                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2604                                 "reduction: unsafe int math optimization"
2605                                 " (overflow traps): ");
2606               return NULL;
2607             }
2608           if (need_wrapping_integral_overflow
2609               && !TYPE_OVERFLOW_WRAPS (type)
2610               && operation_can_overflow (code))
2611             {
2612               /* Changing the order of operations changes the semantics.  */
2613               if (dump_enabled_p ())
2614                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2615                                 "reduction: unsafe int math optimization"
2616                                 " (overflow doesn't wrap): ");
2617               return NULL;
2618             }
2619         }
2620       else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2621         {
2622           /* Changing the order of operations changes the semantics.  */
2623           if (dump_enabled_p ())
2624           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2625                           "reduction: unsafe fixed-point math optimization: ");
2626           return NULL;
2627         }
2628     }
2629
2630   /* If we detected "res -= x[i]" earlier, rewrite it into
2631      "res += -x[i]" now.  If this turns out to be useless reassoc
2632      will clean it up again.  */
2633   if (orig_code == MINUS_EXPR)
2634     {
2635       tree rhs = gimple_assign_rhs2 (def_stmt);
2636       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2637       gimple *negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2638       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2639       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2640                                                           loop_info));
2641       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2642       gimple_assign_set_rhs2 (def_stmt, negrhs);
2643       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2644       update_stmt (def_stmt);
2645     }
2646
2647   /* Reduction is safe. We're dealing with one of the following:
2648      1) integer arithmetic and no trapv
2649      2) floating point arithmetic, and special flags permit this optimization
2650      3) nested cycle (i.e., outer loop vectorization).  */
2651   if (TREE_CODE (op1) == SSA_NAME)
2652     def1 = SSA_NAME_DEF_STMT (op1);
2653
2654   if (TREE_CODE (op2) == SSA_NAME)
2655     def2 = SSA_NAME_DEF_STMT (op2);
2656
2657   if (code != COND_EXPR
2658       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2659     {
2660       if (dump_enabled_p ())
2661         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2662       return NULL;
2663     }
2664
2665   /* Check that one def is the reduction def, defined by PHI,
2666      the other def is either defined in the loop ("vect_internal_def"),
2667      or it's an induction (defined by a loop-header phi-node).  */
2668
2669   if (def2 && def2 == phi
2670       && (code == COND_EXPR
2671           || !def1 || gimple_nop_p (def1)
2672           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2673           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2674               && (is_gimple_assign (def1)
2675                   || is_gimple_call (def1)
2676                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2677                       == vect_induction_def
2678                   || (gimple_code (def1) == GIMPLE_PHI
2679                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2680                           == vect_internal_def
2681                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2682     {
2683       if (dump_enabled_p ())
2684         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2685       return def_stmt;
2686     }
2687
2688   if (def1 && def1 == phi
2689       && (code == COND_EXPR
2690           || !def2 || gimple_nop_p (def2)
2691           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2692           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2693               && (is_gimple_assign (def2)
2694                   || is_gimple_call (def2)
2695                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2696                       == vect_induction_def
2697                   || (gimple_code (def2) == GIMPLE_PHI
2698                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2699                           == vect_internal_def
2700                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2701     {
2702       if (check_reduction)
2703         {
2704           if (code == COND_EXPR)
2705             {
2706               /* No current known use where this case would be useful.  */
2707               if (dump_enabled_p ())
2708                 report_vect_op (MSG_NOTE, def_stmt,
2709                                 "detected reduction: cannot currently swap "
2710                                 "operands for cond_expr");
2711               return NULL;
2712             }
2713
2714           /* Swap operands (just for simplicity - so that the rest of the code
2715              can assume that the reduction variable is always the last (second)
2716              argument).  */
2717           if (dump_enabled_p ())
2718             report_vect_op (MSG_NOTE, def_stmt,
2719                             "detected reduction: need to swap operands: ");
2720
2721           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2722                              gimple_assign_rhs2_ptr (def_stmt));
2723
2724           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2725             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2726         }
2727       else
2728         {
2729           if (dump_enabled_p ())
2730             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2731         }
2732
2733       return def_stmt;
2734     }
2735
2736   /* Try to find SLP reduction chain.  */
2737   if (check_reduction && code != COND_EXPR
2738       && vect_is_slp_reduction (loop_info, phi, def_stmt))
2739     {
2740       if (dump_enabled_p ())
2741         report_vect_op (MSG_NOTE, def_stmt,
2742                         "reduction: detected reduction chain: ");
2743
2744       return def_stmt;
2745     }
2746
2747   if (dump_enabled_p ())
2748     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2749                     "reduction: unknown pattern: ");
2750
2751   return NULL;
2752 }
2753
2754 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2755    in-place.  Arguments as there.  */
2756
2757 static gimple *
2758 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2759                           bool check_reduction, bool *double_reduc,
2760                           bool need_wrapping_integral_overflow,
2761                           enum vect_reduction_type *v_reduc_type)
2762 {
2763   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2764                                      double_reduc, false,
2765                                      need_wrapping_integral_overflow,
2766                                      v_reduc_type);
2767 }
2768
2769 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2770    in-place if it enables detection of more reductions.  Arguments
2771    as there.  */
2772
2773 gimple *
2774 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2775                              bool check_reduction, bool *double_reduc,
2776                              bool need_wrapping_integral_overflow)
2777 {
2778   enum vect_reduction_type v_reduc_type;
2779   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2780                                      double_reduc, true,
2781                                      need_wrapping_integral_overflow,
2782                                      &v_reduc_type);
2783 }
2784
2785 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2786 int
2787 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2788                              int *peel_iters_epilogue,
2789                              stmt_vector_for_cost *scalar_cost_vec,
2790                              stmt_vector_for_cost *prologue_cost_vec,
2791                              stmt_vector_for_cost *epilogue_cost_vec)
2792 {
2793   int retval = 0;
2794   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2795
2796   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2797     {
2798       *peel_iters_epilogue = vf/2;
2799       if (dump_enabled_p ())
2800         dump_printf_loc (MSG_NOTE, vect_location,
2801                          "cost model: epilogue peel iters set to vf/2 "
2802                          "because loop iterations are unknown .\n");
2803
2804       /* If peeled iterations are known but number of scalar loop
2805          iterations are unknown, count a taken branch per peeled loop.  */
2806       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2807                                  NULL, 0, vect_prologue);
2808       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2809                                  NULL, 0, vect_epilogue);
2810     }
2811   else
2812     {
2813       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2814       peel_iters_prologue = niters < peel_iters_prologue ?
2815                             niters : peel_iters_prologue;
2816       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2817       /* If we need to peel for gaps, but no peeling is required, we have to
2818          peel VF iterations.  */
2819       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2820         *peel_iters_epilogue = vf;
2821     }
2822
2823   stmt_info_for_cost *si;
2824   int j;
2825   if (peel_iters_prologue)
2826     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2827       retval += record_stmt_cost (prologue_cost_vec,
2828                                   si->count * peel_iters_prologue,
2829                                   si->kind, NULL, si->misalign,
2830                                   vect_prologue);
2831   if (*peel_iters_epilogue)
2832     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2833       retval += record_stmt_cost (epilogue_cost_vec,
2834                                   si->count * *peel_iters_epilogue,
2835                                   si->kind, NULL, si->misalign,
2836                                   vect_epilogue);
2837
2838   return retval;
2839 }
2840
2841 /* Function vect_estimate_min_profitable_iters
2842
2843    Return the number of iterations required for the vector version of the
2844    loop to be profitable relative to the cost of the scalar version of the
2845    loop.  */
2846
2847 static void
2848 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2849                                     int *ret_min_profitable_niters,
2850                                     int *ret_min_profitable_estimate)
2851 {
2852   int min_profitable_iters;
2853   int min_profitable_estimate;
2854   int peel_iters_prologue;
2855   int peel_iters_epilogue;
2856   unsigned vec_inside_cost = 0;
2857   int vec_outside_cost = 0;
2858   unsigned vec_prologue_cost = 0;
2859   unsigned vec_epilogue_cost = 0;
2860   int scalar_single_iter_cost = 0;
2861   int scalar_outside_cost = 0;
2862   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2863   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2864   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2865
2866   /* Cost model disabled.  */
2867   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2868     {
2869       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2870       *ret_min_profitable_niters = 0;
2871       *ret_min_profitable_estimate = 0;
2872       return;
2873     }
2874
2875   /* Requires loop versioning tests to handle misalignment.  */
2876   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2877     {
2878       /*  FIXME: Make cost depend on complexity of individual check.  */
2879       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2880       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2881                             vect_prologue);
2882       dump_printf (MSG_NOTE,
2883                    "cost model: Adding cost of checks for loop "
2884                    "versioning to treat misalignment.\n");
2885     }
2886
2887   /* Requires loop versioning with alias checks.  */
2888   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2889     {
2890       /*  FIXME: Make cost depend on complexity of individual check.  */
2891       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
2892       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2893                             vect_prologue);
2894       dump_printf (MSG_NOTE,
2895                    "cost model: Adding cost of checks for loop "
2896                    "versioning aliasing.\n");
2897     }
2898
2899   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2900       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2901     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2902                           vect_prologue);
2903
2904   /* Count statements in scalar loop.  Using this as scalar cost for a single
2905      iteration for now.
2906
2907      TODO: Add outer loop support.
2908
2909      TODO: Consider assigning different costs to different scalar
2910      statements.  */
2911
2912   scalar_single_iter_cost
2913     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2914
2915   /* Add additional cost for the peeled instructions in prologue and epilogue
2916      loop.
2917
2918      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2919      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2920
2921      TODO: Build an expression that represents peel_iters for prologue and
2922      epilogue to be used in a run-time test.  */
2923
2924   if (npeel  < 0)
2925     {
2926       peel_iters_prologue = vf/2;
2927       dump_printf (MSG_NOTE, "cost model: "
2928                    "prologue peel iters set to vf/2.\n");
2929
2930       /* If peeling for alignment is unknown, loop bound of main loop becomes
2931          unknown.  */
2932       peel_iters_epilogue = vf/2;
2933       dump_printf (MSG_NOTE, "cost model: "
2934                    "epilogue peel iters set to vf/2 because "
2935                    "peeling for alignment is unknown.\n");
2936
2937       /* If peeled iterations are unknown, count a taken branch and a not taken
2938          branch per peeled loop. Even if scalar loop iterations are known,
2939          vector iterations are not known since peeled prologue iterations are
2940          not known. Hence guards remain the same.  */
2941       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2942                             NULL, 0, vect_prologue);
2943       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2944                             NULL, 0, vect_prologue);
2945       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2946                             NULL, 0, vect_epilogue);
2947       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2948                             NULL, 0, vect_epilogue);
2949       stmt_info_for_cost *si;
2950       int j;
2951       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2952         {
2953           struct _stmt_vec_info *stmt_info
2954             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2955           (void) add_stmt_cost (target_cost_data,
2956                                 si->count * peel_iters_prologue,
2957                                 si->kind, stmt_info, si->misalign,
2958                                 vect_prologue);
2959           (void) add_stmt_cost (target_cost_data,
2960                                 si->count * peel_iters_epilogue,
2961                                 si->kind, stmt_info, si->misalign,
2962                                 vect_epilogue);
2963         }
2964     }
2965   else
2966     {
2967       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2968       stmt_info_for_cost *si;
2969       int j;
2970       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2971
2972       prologue_cost_vec.create (2);
2973       epilogue_cost_vec.create (2);
2974       peel_iters_prologue = npeel;
2975
2976       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2977                                           &peel_iters_epilogue,
2978                                           &LOOP_VINFO_SCALAR_ITERATION_COST
2979                                             (loop_vinfo),
2980                                           &prologue_cost_vec,
2981                                           &epilogue_cost_vec);
2982
2983       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2984         {
2985           struct _stmt_vec_info *stmt_info
2986             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2987           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2988                                 si->misalign, vect_prologue);
2989         }
2990
2991       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2992         {
2993           struct _stmt_vec_info *stmt_info
2994             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2995           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2996                                 si->misalign, vect_epilogue);
2997         }
2998
2999       prologue_cost_vec.release ();
3000       epilogue_cost_vec.release ();
3001     }
3002
3003   /* FORNOW: The scalar outside cost is incremented in one of the
3004      following ways:
3005
3006      1. The vectorizer checks for alignment and aliasing and generates
3007      a condition that allows dynamic vectorization.  A cost model
3008      check is ANDED with the versioning condition.  Hence scalar code
3009      path now has the added cost of the versioning check.
3010
3011        if (cost > th & versioning_check)
3012          jmp to vector code
3013
3014      Hence run-time scalar is incremented by not-taken branch cost.
3015
3016      2. The vectorizer then checks if a prologue is required.  If the
3017      cost model check was not done before during versioning, it has to
3018      be done before the prologue check.
3019
3020        if (cost <= th)
3021          prologue = scalar_iters
3022        if (prologue == 0)
3023          jmp to vector code
3024        else
3025          execute prologue
3026        if (prologue == num_iters)
3027          go to exit
3028
3029      Hence the run-time scalar cost is incremented by a taken branch,
3030      plus a not-taken branch, plus a taken branch cost.
3031
3032      3. The vectorizer then checks if an epilogue is required.  If the
3033      cost model check was not done before during prologue check, it
3034      has to be done with the epilogue check.
3035
3036        if (prologue == 0)
3037          jmp to vector code
3038        else
3039          execute prologue
3040        if (prologue == num_iters)
3041          go to exit
3042        vector code:
3043          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3044            jmp to epilogue
3045
3046      Hence the run-time scalar cost should be incremented by 2 taken
3047      branches.
3048
3049      TODO: The back end may reorder the BBS's differently and reverse
3050      conditions/branch directions.  Change the estimates below to
3051      something more reasonable.  */
3052
3053   /* If the number of iterations is known and we do not do versioning, we can
3054      decide whether to vectorize at compile time.  Hence the scalar version
3055      do not carry cost model guard costs.  */
3056   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3057       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3058       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3059     {
3060       /* Cost model check occurs at versioning.  */
3061       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3062           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3063         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3064       else
3065         {
3066           /* Cost model check occurs at prologue generation.  */
3067           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3068             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3069               + vect_get_stmt_cost (cond_branch_not_taken);
3070           /* Cost model check occurs at epilogue generation.  */
3071           else
3072             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3073         }
3074     }
3075
3076   /* Complete the target-specific cost calculations.  */
3077   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3078                &vec_inside_cost, &vec_epilogue_cost);
3079
3080   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3081
3082   if (dump_enabled_p ())
3083     {
3084       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3085       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3086                    vec_inside_cost);
3087       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3088                    vec_prologue_cost);
3089       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3090                    vec_epilogue_cost);
3091       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3092                    scalar_single_iter_cost);
3093       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3094                    scalar_outside_cost);
3095       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3096                    vec_outside_cost);
3097       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3098                    peel_iters_prologue);
3099       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3100                    peel_iters_epilogue);
3101     }
3102
3103   /* Calculate number of iterations required to make the vector version
3104      profitable, relative to the loop bodies only.  The following condition
3105      must hold true:
3106      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3107      where
3108      SIC = scalar iteration cost, VIC = vector iteration cost,
3109      VOC = vector outside cost, VF = vectorization factor,
3110      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3111      SOC = scalar outside cost for run time cost model check.  */
3112
3113   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3114     {
3115       if (vec_outside_cost <= 0)
3116         min_profitable_iters = 1;
3117       else
3118         {
3119           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3120                                   - vec_inside_cost * peel_iters_prologue
3121                                   - vec_inside_cost * peel_iters_epilogue)
3122                                  / ((scalar_single_iter_cost * vf)
3123                                     - vec_inside_cost);
3124
3125           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3126               <= (((int) vec_inside_cost * min_profitable_iters)
3127                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3128             min_profitable_iters++;
3129         }
3130     }
3131   /* vector version will never be profitable.  */
3132   else
3133     {
3134       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3135         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3136                     "did not happen for a simd loop");
3137
3138       if (dump_enabled_p ())
3139         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140                          "cost model: the vector iteration cost = %d "
3141                          "divided by the scalar iteration cost = %d "
3142                          "is greater or equal to the vectorization factor = %d"
3143                          ".\n",
3144                          vec_inside_cost, scalar_single_iter_cost, vf);
3145       *ret_min_profitable_niters = -1;
3146       *ret_min_profitable_estimate = -1;
3147       return;
3148     }
3149
3150   dump_printf (MSG_NOTE,
3151                "  Calculated minimum iters for profitability: %d\n",
3152                min_profitable_iters);
3153
3154   min_profitable_iters =
3155         min_profitable_iters < vf ? vf : min_profitable_iters;
3156
3157   /* Because the condition we create is:
3158      if (niters <= min_profitable_iters)
3159        then skip the vectorized loop.  */
3160   min_profitable_iters--;
3161
3162   if (dump_enabled_p ())
3163     dump_printf_loc (MSG_NOTE, vect_location,
3164                      "  Runtime profitability threshold = %d\n",
3165                      min_profitable_iters);
3166
3167   *ret_min_profitable_niters = min_profitable_iters;
3168
3169   /* Calculate number of iterations required to make the vector version
3170      profitable, relative to the loop bodies only.
3171
3172      Non-vectorized variant is SIC * niters and it must win over vector
3173      variant on the expected loop trip count.  The following condition must hold true:
3174      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3175
3176   if (vec_outside_cost <= 0)
3177     min_profitable_estimate = 1;
3178   else
3179     {
3180       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3181                                  - vec_inside_cost * peel_iters_prologue
3182                                  - vec_inside_cost * peel_iters_epilogue)
3183                                  / ((scalar_single_iter_cost * vf)
3184                                    - vec_inside_cost);
3185     }
3186   min_profitable_estimate --;
3187   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3188   if (dump_enabled_p ())
3189     dump_printf_loc (MSG_NOTE, vect_location,
3190                      "  Static estimate profitability threshold = %d\n",
3191                       min_profitable_iters);
3192
3193   *ret_min_profitable_estimate = min_profitable_estimate;
3194 }
3195
3196 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3197    vector elements (not bits) for a vector of mode MODE.  */
3198 static void
3199 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3200                               unsigned char *sel)
3201 {
3202   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3203
3204   for (i = 0; i < nelt; i++)
3205     sel[i] = (i + offset) & (2*nelt - 1);
3206 }
3207
3208 /* Checks whether the target supports whole-vector shifts for vectors of mode
3209    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3210    it supports vec_perm_const with masks for all necessary shift amounts.  */
3211 static bool
3212 have_whole_vector_shift (enum machine_mode mode)
3213 {
3214   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3215     return true;
3216
3217   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3218     return false;
3219
3220   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3221   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3222
3223   for (i = nelt/2; i >= 1; i/=2)
3224     {
3225       calc_vec_perm_mask_for_shift (mode, i, sel);
3226       if (!can_vec_perm_p (mode, false, sel))
3227         return false;
3228     }
3229   return true;
3230 }
3231
3232 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3233
3234 static tree
3235 get_reduction_op (gimple *stmt, int reduc_index)
3236 {
3237   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3238     {
3239     case GIMPLE_SINGLE_RHS:
3240       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3241                   == ternary_op);
3242       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3243     case GIMPLE_UNARY_RHS:
3244       return gimple_assign_rhs1 (stmt);
3245     case GIMPLE_BINARY_RHS:
3246       return (reduc_index
3247               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3248     case GIMPLE_TERNARY_RHS:
3249       return gimple_op (stmt, reduc_index + 1);
3250     default:
3251       gcc_unreachable ();
3252     }
3253 }
3254
3255 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3256    functions. Design better to avoid maintenance issues.  */
3257
3258 /* Function vect_model_reduction_cost.
3259
3260    Models cost for a reduction operation, including the vector ops
3261    generated within the strip-mine loop, the initial definition before
3262    the loop, and the epilogue code that must be generated.  */
3263
3264 static bool
3265 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3266                            int ncopies, int reduc_index)
3267 {
3268   int prologue_cost = 0, epilogue_cost = 0;
3269   enum tree_code code;
3270   optab optab;
3271   tree vectype;
3272   gimple *stmt, *orig_stmt;
3273   tree reduction_op;
3274   machine_mode mode;
3275   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3276   struct loop *loop = NULL;
3277   void *target_cost_data;
3278
3279   if (loop_vinfo)
3280     {
3281       loop = LOOP_VINFO_LOOP (loop_vinfo);
3282       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3283     }
3284   else
3285     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3286
3287   /* Condition reductions generate two reductions in the loop.  */
3288   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3289     ncopies *= 2;
3290
3291   /* Cost of reduction op inside loop.  */
3292   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3293                                         stmt_info, 0, vect_body);
3294   stmt = STMT_VINFO_STMT (stmt_info);
3295
3296   reduction_op = get_reduction_op (stmt, reduc_index);
3297
3298   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3299   if (!vectype)
3300     {
3301       if (dump_enabled_p ())
3302         {
3303           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3304                            "unsupported data-type ");
3305           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3306                              TREE_TYPE (reduction_op));
3307           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3308         }
3309       return false;
3310    }
3311
3312   mode = TYPE_MODE (vectype);
3313   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3314
3315   if (!orig_stmt)
3316     orig_stmt = STMT_VINFO_STMT (stmt_info);
3317
3318   code = gimple_assign_rhs_code (orig_stmt);
3319
3320   /* Add in cost for initial definition.
3321      For cond reduction we have four vectors: initial index, step, initial
3322      result of the data reduction, initial value of the index reduction.  */
3323   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3324                        == COND_REDUCTION ? 4 : 1;
3325   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3326                                   scalar_to_vec, stmt_info, 0,
3327                                   vect_prologue);
3328
3329   /* Determine cost of epilogue code.
3330
3331      We have a reduction operator that will reduce the vector in one statement.
3332      Also requires scalar extract.  */
3333
3334   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3335     {
3336       if (reduc_code != ERROR_MARK)
3337         {
3338           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3339             {
3340               /* An EQ stmt and an COND_EXPR stmt.  */
3341               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3342                                               vector_stmt, stmt_info, 0,
3343                                               vect_epilogue);
3344               /* Reduction of the max index and a reduction of the found
3345                  values.  */
3346               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3347                                               vec_to_scalar, stmt_info, 0,
3348                                               vect_epilogue);
3349               /* A broadcast of the max value.  */
3350               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3351                                               scalar_to_vec, stmt_info, 0,
3352                                               vect_epilogue);
3353             }
3354           else
3355             {
3356               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3357                                               stmt_info, 0, vect_epilogue);
3358               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3359                                               vec_to_scalar, stmt_info, 0,
3360                                               vect_epilogue);
3361             }
3362         }
3363       else
3364         {
3365           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3366           tree bitsize =
3367             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3368           int element_bitsize = tree_to_uhwi (bitsize);
3369           int nelements = vec_size_in_bits / element_bitsize;
3370
3371           optab = optab_for_tree_code (code, vectype, optab_default);
3372
3373           /* We have a whole vector shift available.  */
3374           if (VECTOR_MODE_P (mode)
3375               && optab_handler (optab, mode) != CODE_FOR_nothing
3376               && have_whole_vector_shift (mode))
3377             {
3378               /* Final reduction via vector shifts and the reduction operator.
3379                  Also requires scalar extract.  */
3380               epilogue_cost += add_stmt_cost (target_cost_data,
3381                                               exact_log2 (nelements) * 2,
3382                                               vector_stmt, stmt_info, 0,
3383                                               vect_epilogue);
3384               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3385                                               vec_to_scalar, stmt_info, 0,
3386                                               vect_epilogue);
3387             }
3388           else
3389             /* Use extracts and reduction op for final reduction.  For N
3390                elements, we have N extracts and N-1 reduction ops.  */
3391             epilogue_cost += add_stmt_cost (target_cost_data,
3392                                             nelements + nelements - 1,
3393                                             vector_stmt, stmt_info, 0,
3394                                             vect_epilogue);
3395         }
3396     }
3397
3398   if (dump_enabled_p ())
3399     dump_printf (MSG_NOTE,
3400                  "vect_model_reduction_cost: inside_cost = %d, "
3401                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3402                  prologue_cost, epilogue_cost);
3403
3404   return true;
3405 }
3406
3407
3408 /* Function vect_model_induction_cost.
3409
3410    Models cost for induction operations.  */
3411
3412 static void
3413 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3414 {
3415   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3416   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3417   unsigned inside_cost, prologue_cost;
3418
3419   /* loop cost for vec_loop.  */
3420   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3421                                stmt_info, 0, vect_body);
3422
3423   /* prologue cost for vec_init and vec_step.  */
3424   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3425                                  stmt_info, 0, vect_prologue);
3426
3427   if (dump_enabled_p ())
3428     dump_printf_loc (MSG_NOTE, vect_location,
3429                      "vect_model_induction_cost: inside_cost = %d, "
3430                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3431 }
3432
3433
3434 /* Function get_initial_def_for_induction
3435
3436    Input:
3437    STMT - a stmt that performs an induction operation in the loop.
3438    IV_PHI - the initial value of the induction variable
3439
3440    Output:
3441    Return a vector variable, initialized with the first VF values of
3442    the induction variable.  E.g., for an iv with IV_PHI='X' and
3443    evolution S, for a vector of 4 units, we want to return:
3444    [X, X + S, X + 2*S, X + 3*S].  */
3445
3446 static tree
3447 get_initial_def_for_induction (gimple *iv_phi)
3448 {
3449   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3450   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3451   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3452   tree vectype;
3453   int nunits;
3454   edge pe = loop_preheader_edge (loop);
3455   struct loop *iv_loop;
3456   basic_block new_bb;
3457   tree new_vec, vec_init, vec_step, t;
3458   tree new_name;
3459   gimple *new_stmt;
3460   gphi *induction_phi;
3461   tree induc_def, vec_def, vec_dest;
3462   tree init_expr, step_expr;
3463   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3464   int i;
3465   int ncopies;
3466   tree expr;
3467   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3468   bool nested_in_vect_loop = false;
3469   gimple_seq stmts;
3470   imm_use_iterator imm_iter;
3471   use_operand_p use_p;
3472   gimple *exit_phi;
3473   edge latch_e;
3474   tree loop_arg;
3475   gimple_stmt_iterator si;
3476   basic_block bb = gimple_bb (iv_phi);
3477   tree stepvectype;
3478   tree resvectype;
3479
3480   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3481   if (nested_in_vect_loop_p (loop, iv_phi))
3482     {
3483       nested_in_vect_loop = true;
3484       iv_loop = loop->inner;
3485     }
3486   else
3487     iv_loop = loop;
3488   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3489
3490   latch_e = loop_latch_edge (iv_loop);
3491   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3492
3493   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3494   gcc_assert (step_expr != NULL_TREE);
3495
3496   pe = loop_preheader_edge (iv_loop);
3497   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3498                                      loop_preheader_edge (iv_loop));
3499
3500   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3501   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3502   gcc_assert (vectype);
3503   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3504   ncopies = vf / nunits;
3505
3506   gcc_assert (phi_info);
3507   gcc_assert (ncopies >= 1);
3508
3509   /* Convert the step to the desired type.  */
3510   stmts = NULL;
3511   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3512   if (stmts)
3513     {
3514       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3515       gcc_assert (!new_bb);
3516     }
3517
3518   /* Find the first insertion point in the BB.  */
3519   si = gsi_after_labels (bb);
3520
3521   /* Create the vector that holds the initial_value of the induction.  */
3522   if (nested_in_vect_loop)
3523     {
3524       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3525          been created during vectorization of previous stmts.  We obtain it
3526          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3527       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3528       /* If the initial value is not of proper type, convert it.  */
3529       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3530         {
3531           new_stmt
3532             = gimple_build_assign (vect_get_new_ssa_name (vectype,
3533                                                           vect_simple_var,
3534                                                           "vec_iv_"),
3535                                    VIEW_CONVERT_EXPR,
3536                                    build1 (VIEW_CONVERT_EXPR, vectype,
3537                                            vec_init));
3538           vec_init = gimple_assign_lhs (new_stmt);
3539           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3540                                                  new_stmt);
3541           gcc_assert (!new_bb);
3542           set_vinfo_for_stmt (new_stmt,
3543                               new_stmt_vec_info (new_stmt, loop_vinfo));
3544         }
3545     }
3546   else
3547     {
3548       vec<constructor_elt, va_gc> *v;
3549
3550       /* iv_loop is the loop to be vectorized. Create:
3551          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3552       stmts = NULL;
3553       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3554
3555       vec_alloc (v, nunits);
3556       bool constant_p = is_gimple_min_invariant (new_name);
3557       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3558       for (i = 1; i < nunits; i++)
3559         {
3560           /* Create: new_name_i = new_name + step_expr  */
3561           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3562                                    new_name, step_expr);
3563           if (!is_gimple_min_invariant (new_name))
3564             constant_p = false;
3565           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3566         }
3567       if (stmts)
3568         {
3569           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3570           gcc_assert (!new_bb);
3571         }
3572
3573       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3574       if (constant_p)
3575         new_vec = build_vector_from_ctor (vectype, v);
3576       else
3577         new_vec = build_constructor (vectype, v);
3578       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3579     }
3580
3581
3582   /* Create the vector that holds the step of the induction.  */
3583   if (nested_in_vect_loop)
3584     /* iv_loop is nested in the loop to be vectorized. Generate:
3585        vec_step = [S, S, S, S]  */
3586     new_name = step_expr;
3587   else
3588     {
3589       /* iv_loop is the loop to be vectorized. Generate:
3590           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3591       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3592         {
3593           expr = build_int_cst (integer_type_node, vf);
3594           expr = fold_convert (TREE_TYPE (step_expr), expr);
3595         }
3596       else
3597         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3598       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3599                               expr, step_expr);
3600       if (TREE_CODE (step_expr) == SSA_NAME)
3601         new_name = vect_init_vector (iv_phi, new_name,
3602                                      TREE_TYPE (step_expr), NULL);
3603     }
3604
3605   t = unshare_expr (new_name);
3606   gcc_assert (CONSTANT_CLASS_P (new_name)
3607               || TREE_CODE (new_name) == SSA_NAME);
3608   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3609   gcc_assert (stepvectype);
3610   new_vec = build_vector_from_val (stepvectype, t);
3611   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3612
3613
3614   /* Create the following def-use cycle:
3615      loop prolog:
3616          vec_init = ...
3617          vec_step = ...
3618      loop:
3619          vec_iv = PHI <vec_init, vec_loop>
3620          ...
3621          STMT
3622          ...
3623          vec_loop = vec_iv + vec_step;  */
3624
3625   /* Create the induction-phi that defines the induction-operand.  */
3626   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3627   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3628   set_vinfo_for_stmt (induction_phi,
3629                       new_stmt_vec_info (induction_phi, loop_vinfo));
3630   induc_def = PHI_RESULT (induction_phi);
3631
3632   /* Create the iv update inside the loop  */
3633   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3634   vec_def = make_ssa_name (vec_dest, new_stmt);
3635   gimple_assign_set_lhs (new_stmt, vec_def);
3636   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3637   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3638
3639   /* Set the arguments of the phi node:  */
3640   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3641   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3642                UNKNOWN_LOCATION);
3643
3644
3645   /* In case that vectorization factor (VF) is bigger than the number
3646      of elements that we can fit in a vectype (nunits), we have to generate
3647      more than one vector stmt - i.e - we need to "unroll" the
3648      vector stmt by a factor VF/nunits.  For more details see documentation
3649      in vectorizable_operation.  */
3650
3651   if (ncopies > 1)
3652     {
3653       stmt_vec_info prev_stmt_vinfo;
3654       /* FORNOW. This restriction should be relaxed.  */
3655       gcc_assert (!nested_in_vect_loop);
3656
3657       /* Create the vector that holds the step of the induction.  */
3658       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3659         {
3660           expr = build_int_cst (integer_type_node, nunits);
3661           expr = fold_convert (TREE_TYPE (step_expr), expr);
3662         }
3663       else
3664         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3665       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3666                               expr, step_expr);
3667       if (TREE_CODE (step_expr) == SSA_NAME)
3668         new_name = vect_init_vector (iv_phi, new_name,
3669                                      TREE_TYPE (step_expr), NULL);
3670       t = unshare_expr (new_name);
3671       gcc_assert (CONSTANT_CLASS_P (new_name)
3672                   || TREE_CODE (new_name) == SSA_NAME);
3673       new_vec = build_vector_from_val (stepvectype, t);
3674       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3675
3676       vec_def = induc_def;
3677       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3678       for (i = 1; i < ncopies; i++)
3679         {
3680           /* vec_i = vec_prev + vec_step  */
3681           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3682                                           vec_def, vec_step);
3683           vec_def = make_ssa_name (vec_dest, new_stmt);
3684           gimple_assign_set_lhs (new_stmt, vec_def);
3685
3686           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3687           if (!useless_type_conversion_p (resvectype, vectype))
3688             {
3689               new_stmt
3690                 = gimple_build_assign
3691                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3692                                                 "vec_iv_"),
3693                          VIEW_CONVERT_EXPR,
3694                          build1 (VIEW_CONVERT_EXPR, resvectype,
3695                                  gimple_assign_lhs (new_stmt)));
3696               gimple_assign_set_lhs (new_stmt,
3697                                      make_ssa_name
3698                                        (gimple_assign_lhs (new_stmt), new_stmt));
3699               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3700             }
3701           set_vinfo_for_stmt (new_stmt,
3702                               new_stmt_vec_info (new_stmt, loop_vinfo));
3703           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3704           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3705         }
3706     }
3707
3708   if (nested_in_vect_loop)
3709     {
3710       /* Find the loop-closed exit-phi of the induction, and record
3711          the final vector of induction results:  */
3712       exit_phi = NULL;
3713       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3714         {
3715           gimple *use_stmt = USE_STMT (use_p);
3716           if (is_gimple_debug (use_stmt))
3717             continue;
3718
3719           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3720             {
3721               exit_phi = use_stmt;
3722               break;
3723             }
3724         }
3725       if (exit_phi)
3726         {
3727           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3728           /* FORNOW. Currently not supporting the case that an inner-loop induction
3729              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3730           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3731                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3732
3733           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3734           if (dump_enabled_p ())
3735             {
3736               dump_printf_loc (MSG_NOTE, vect_location,
3737                                "vector of inductions after inner-loop:");
3738               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3739               dump_printf (MSG_NOTE, "\n");
3740             }
3741         }
3742     }
3743
3744
3745   if (dump_enabled_p ())
3746     {
3747       dump_printf_loc (MSG_NOTE, vect_location,
3748                        "transform induction: created def-use cycle: ");
3749       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3750       dump_printf (MSG_NOTE, "\n");
3751       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3752                         SSA_NAME_DEF_STMT (vec_def), 0);
3753       dump_printf (MSG_NOTE, "\n");
3754     }
3755
3756   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3757   if (!useless_type_conversion_p (resvectype, vectype))
3758     {
3759       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3760                                                              vect_simple_var,
3761                                                              "vec_iv_"),
3762                                       VIEW_CONVERT_EXPR,
3763                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3764                                               induc_def));
3765       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3766       gimple_assign_set_lhs (new_stmt, induc_def);
3767       si = gsi_after_labels (bb);
3768       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3769       set_vinfo_for_stmt (new_stmt,
3770                           new_stmt_vec_info (new_stmt, loop_vinfo));
3771       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3772         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3773     }
3774
3775   return induc_def;
3776 }
3777
3778
3779 /* Function get_initial_def_for_reduction
3780
3781    Input:
3782    STMT - a stmt that performs a reduction operation in the loop.
3783    INIT_VAL - the initial value of the reduction variable
3784
3785    Output:
3786    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3787         of the reduction (used for adjusting the epilog - see below).
3788    Return a vector variable, initialized according to the operation that STMT
3789         performs. This vector will be used as the initial value of the
3790         vector of partial results.
3791
3792    Option1 (adjust in epilog): Initialize the vector as follows:
3793      add/bit or/xor:    [0,0,...,0,0]
3794      mult/bit and:      [1,1,...,1,1]
3795      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3796    and when necessary (e.g. add/mult case) let the caller know
3797    that it needs to adjust the result by init_val.
3798
3799    Option2: Initialize the vector as follows:
3800      add/bit or/xor:    [init_val,0,0,...,0]
3801      mult/bit and:      [init_val,1,1,...,1]
3802      min/max/cond_expr: [init_val,init_val,...,init_val]
3803    and no adjustments are needed.
3804
3805    For example, for the following code:
3806
3807    s = init_val;
3808    for (i=0;i<n;i++)
3809      s = s + a[i];
3810
3811    STMT is 's = s + a[i]', and the reduction variable is 's'.
3812    For a vector of 4 units, we want to return either [0,0,0,init_val],
3813    or [0,0,0,0] and let the caller know that it needs to adjust
3814    the result at the end by 'init_val'.
3815
3816    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3817    initialization vector is simpler (same element in all entries), if
3818    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3819
3820    A cost model should help decide between these two schemes.  */
3821
3822 tree
3823 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3824                                tree *adjustment_def)
3825 {
3826   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3827   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3828   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3829   tree scalar_type = TREE_TYPE (init_val);
3830   tree vectype = get_vectype_for_scalar_type (scalar_type);
3831   int nunits;
3832   enum tree_code code = gimple_assign_rhs_code (stmt);
3833   tree def_for_init;
3834   tree init_def;
3835   tree *elts;
3836   int i;
3837   bool nested_in_vect_loop = false;
3838   tree init_value;
3839   REAL_VALUE_TYPE real_init_val = dconst0;
3840   int int_init_val = 0;
3841   gimple *def_stmt = NULL;
3842
3843   gcc_assert (vectype);
3844   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3845
3846   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3847               || SCALAR_FLOAT_TYPE_P (scalar_type));
3848
3849   if (nested_in_vect_loop_p (loop, stmt))
3850     nested_in_vect_loop = true;
3851   else
3852     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3853
3854   /* In case of double reduction we only create a vector variable to be put
3855      in the reduction phi node.  The actual statement creation is done in
3856      vect_create_epilog_for_reduction.  */
3857   if (adjustment_def && nested_in_vect_loop
3858       && TREE_CODE (init_val) == SSA_NAME
3859       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3860       && gimple_code (def_stmt) == GIMPLE_PHI
3861       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3862       && vinfo_for_stmt (def_stmt)
3863       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3864           == vect_double_reduction_def)
3865     {
3866       *adjustment_def = NULL;
3867       return vect_create_destination_var (init_val, vectype);
3868     }
3869
3870   if (TREE_CONSTANT (init_val))
3871     {
3872       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3873         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3874       else
3875         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3876     }
3877   else
3878     init_value = init_val;
3879
3880   switch (code)
3881     {
3882       case WIDEN_SUM_EXPR:
3883       case DOT_PROD_EXPR:
3884       case SAD_EXPR:
3885       case PLUS_EXPR:
3886       case MINUS_EXPR:
3887       case BIT_IOR_EXPR:
3888       case BIT_XOR_EXPR:
3889       case MULT_EXPR:
3890       case BIT_AND_EXPR:
3891         /* ADJUSMENT_DEF is NULL when called from
3892            vect_create_epilog_for_reduction to vectorize double reduction.  */
3893         if (adjustment_def)
3894           {
3895             if (nested_in_vect_loop)
3896               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
3897             else
3898               *adjustment_def = init_val;
3899           }
3900
3901         if (code == MULT_EXPR)
3902           {
3903             real_init_val = dconst1;
3904             int_init_val = 1;
3905           }
3906
3907         if (code == BIT_AND_EXPR)
3908           int_init_val = -1;
3909
3910         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3911           def_for_init = build_real (scalar_type, real_init_val);
3912         else
3913           def_for_init = build_int_cst (scalar_type, int_init_val);
3914
3915         /* Create a vector of '0' or '1' except the first element.  */
3916         elts = XALLOCAVEC (tree, nunits);
3917         for (i = nunits - 2; i >= 0; --i)
3918           elts[i + 1] = def_for_init;
3919
3920         /* Option1: the first element is '0' or '1' as well.  */
3921         if (adjustment_def)
3922           {
3923             elts[0] = def_for_init;
3924             init_def = build_vector (vectype, elts);
3925             break;
3926           }
3927
3928         /* Option2: the first element is INIT_VAL.  */
3929         elts[0] = init_val;
3930         if (TREE_CONSTANT (init_val))
3931           init_def = build_vector (vectype, elts);
3932         else
3933           {
3934             vec<constructor_elt, va_gc> *v;
3935             vec_alloc (v, nunits);
3936             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3937             for (i = 1; i < nunits; ++i)
3938               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3939             init_def = build_constructor (vectype, v);
3940           }
3941
3942         break;
3943
3944       case MIN_EXPR:
3945       case MAX_EXPR:
3946       case COND_EXPR:
3947         if (adjustment_def)
3948           {
3949             *adjustment_def = NULL_TREE;
3950             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
3951               {
3952                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
3953                 break;
3954               }
3955           }
3956         init_def = build_vector_from_val (vectype, init_value);
3957         break;
3958
3959       default:
3960         gcc_unreachable ();
3961     }
3962
3963   return init_def;
3964 }
3965
3966 /* Function vect_create_epilog_for_reduction
3967
3968    Create code at the loop-epilog to finalize the result of a reduction
3969    computation.
3970
3971    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3972      reduction statements.
3973    STMT is the scalar reduction stmt that is being vectorized.
3974    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3975      number of elements that we can fit in a vectype (nunits).  In this case
3976      we have to generate more than one vector stmt - i.e - we need to "unroll"
3977      the vector stmt by a factor VF/nunits.  For more details see documentation
3978      in vectorizable_operation.
3979    REDUC_CODE is the tree-code for the epilog reduction.
3980    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3981      computation.
3982    REDUC_INDEX is the index of the operand in the right hand side of the
3983      statement that is defined by REDUCTION_PHI.
3984    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3985    SLP_NODE is an SLP node containing a group of reduction statements. The
3986      first one in this group is STMT.
3987    INDUCTION_INDEX is the index of the loop for condition reductions.
3988      Otherwise it is undefined.
3989
3990    This function:
3991    1. Creates the reduction def-use cycles: sets the arguments for
3992       REDUCTION_PHIS:
3993       The loop-entry argument is the vectorized initial-value of the reduction.
3994       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3995       sums.
3996    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3997       by applying the operation specified by REDUC_CODE if available, or by
3998       other means (whole-vector shifts or a scalar loop).
3999       The function also creates a new phi node at the loop exit to preserve
4000       loop-closed form, as illustrated below.
4001
4002      The flow at the entry to this function:
4003
4004         loop:
4005           vec_def = phi <null, null>            # REDUCTION_PHI
4006           VECT_DEF = vector_stmt                # vectorized form of STMT
4007           s_loop = scalar_stmt                  # (scalar) STMT
4008         loop_exit:
4009           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4010           use <s_out0>
4011           use <s_out0>
4012
4013      The above is transformed by this function into:
4014
4015         loop:
4016           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4017           VECT_DEF = vector_stmt                # vectorized form of STMT
4018           s_loop = scalar_stmt                  # (scalar) STMT
4019         loop_exit:
4020           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4021           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4022           v_out2 = reduce <v_out1>
4023           s_out3 = extract_field <v_out2, 0>
4024           s_out4 = adjust_result <s_out3>
4025           use <s_out4>
4026           use <s_out4>
4027 */
4028
4029 static void
4030 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4031                                   int ncopies, enum tree_code reduc_code,
4032                                   vec<gimple *> reduction_phis,
4033                                   int reduc_index, bool double_reduc,
4034                                   slp_tree slp_node, tree induction_index)
4035 {
4036   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4037   stmt_vec_info prev_phi_info;
4038   tree vectype;
4039   machine_mode mode;
4040   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4041   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4042   basic_block exit_bb;
4043   tree scalar_dest;
4044   tree scalar_type;
4045   gimple *new_phi = NULL, *phi;
4046   gimple_stmt_iterator exit_gsi;
4047   tree vec_dest;
4048   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4049   gimple *epilog_stmt = NULL;
4050   enum tree_code code = gimple_assign_rhs_code (stmt);
4051   gimple *exit_phi;
4052   tree bitsize;
4053   tree adjustment_def = NULL;
4054   tree vec_initial_def = NULL;
4055   tree reduction_op, expr, def;
4056   tree orig_name, scalar_result;
4057   imm_use_iterator imm_iter, phi_imm_iter;
4058   use_operand_p use_p, phi_use_p;
4059   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4060   bool nested_in_vect_loop = false;
4061   auto_vec<gimple *> new_phis;
4062   auto_vec<gimple *> inner_phis;
4063   enum vect_def_type dt = vect_unknown_def_type;
4064   int j, i;
4065   auto_vec<tree> scalar_results;
4066   unsigned int group_size = 1, k, ratio;
4067   auto_vec<tree> vec_initial_defs;
4068   auto_vec<gimple *> phis;
4069   bool slp_reduc = false;
4070   tree new_phi_result;
4071   gimple *inner_phi = NULL;
4072
4073   if (slp_node)
4074     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4075
4076   if (nested_in_vect_loop_p (loop, stmt))
4077     {
4078       outer_loop = loop;
4079       loop = loop->inner;
4080       nested_in_vect_loop = true;
4081       gcc_assert (!slp_node);
4082     }
4083
4084   reduction_op = get_reduction_op (stmt, reduc_index);
4085
4086   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4087   gcc_assert (vectype);
4088   mode = TYPE_MODE (vectype);
4089
4090   /* 1. Create the reduction def-use cycle:
4091      Set the arguments of REDUCTION_PHIS, i.e., transform
4092
4093         loop:
4094           vec_def = phi <null, null>            # REDUCTION_PHI
4095           VECT_DEF = vector_stmt                # vectorized form of STMT
4096           ...
4097
4098      into:
4099
4100         loop:
4101           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4102           VECT_DEF = vector_stmt                # vectorized form of STMT
4103           ...
4104
4105      (in case of SLP, do it for all the phis). */
4106
4107   /* Get the loop-entry arguments.  */
4108   if (slp_node)
4109     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4110                        NULL, slp_node, reduc_index);
4111   else
4112     {
4113       /* Get at the scalar def before the loop, that defines the initial value
4114          of the reduction variable.  */
4115       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4116       tree op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
4117       vec_initial_defs.create (1);
4118       vec_initial_def = get_initial_def_for_reduction (stmt, op,
4119                                                        &adjustment_def);
4120       vec_initial_defs.quick_push (vec_initial_def);
4121     }
4122
4123   /* Set phi nodes arguments.  */
4124   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4125     {
4126       tree vec_init_def, def;
4127       gimple_seq stmts;
4128       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4129                                            true, NULL_TREE);
4130       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4131       def = vect_defs[i];
4132       for (j = 0; j < ncopies; j++)
4133         {
4134           /* Set the loop-entry arg of the reduction-phi.  */
4135           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4136                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4137
4138           /* Set the loop-latch arg for the reduction-phi.  */
4139           if (j > 0)
4140             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4141
4142           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4143                        UNKNOWN_LOCATION);
4144
4145           if (dump_enabled_p ())
4146             {
4147               dump_printf_loc (MSG_NOTE, vect_location,
4148                                "transform reduction: created def-use cycle: ");
4149               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4150               dump_printf (MSG_NOTE, "\n");
4151               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4152               dump_printf (MSG_NOTE, "\n");
4153             }
4154
4155           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4156         }
4157     }
4158
4159   /* 2. Create epilog code.
4160         The reduction epilog code operates across the elements of the vector
4161         of partial results computed by the vectorized loop.
4162         The reduction epilog code consists of:
4163
4164         step 1: compute the scalar result in a vector (v_out2)
4165         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4166         step 3: adjust the scalar result (s_out3) if needed.
4167
4168         Step 1 can be accomplished using one the following three schemes:
4169           (scheme 1) using reduc_code, if available.
4170           (scheme 2) using whole-vector shifts, if available.
4171           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4172                      combined.
4173
4174           The overall epilog code looks like this:
4175
4176           s_out0 = phi <s_loop>         # original EXIT_PHI
4177           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4178           v_out2 = reduce <v_out1>              # step 1
4179           s_out3 = extract_field <v_out2, 0>    # step 2
4180           s_out4 = adjust_result <s_out3>       # step 3
4181
4182           (step 3 is optional, and steps 1 and 2 may be combined).
4183           Lastly, the uses of s_out0 are replaced by s_out4.  */
4184
4185
4186   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4187          v_out1 = phi <VECT_DEF>
4188          Store them in NEW_PHIS.  */
4189
4190   exit_bb = single_exit (loop)->dest;
4191   prev_phi_info = NULL;
4192   new_phis.create (vect_defs.length ());
4193   FOR_EACH_VEC_ELT (vect_defs, i, def)
4194     {
4195       for (j = 0; j < ncopies; j++)
4196         {
4197           tree new_def = copy_ssa_name (def);
4198           phi = create_phi_node (new_def, exit_bb);
4199           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4200           if (j == 0)
4201             new_phis.quick_push (phi);
4202           else
4203             {
4204               def = vect_get_vec_def_for_stmt_copy (dt, def);
4205               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4206             }
4207
4208           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4209           prev_phi_info = vinfo_for_stmt (phi);
4210         }
4211     }
4212
4213   /* The epilogue is created for the outer-loop, i.e., for the loop being
4214      vectorized.  Create exit phis for the outer loop.  */
4215   if (double_reduc)
4216     {
4217       loop = outer_loop;
4218       exit_bb = single_exit (loop)->dest;
4219       inner_phis.create (vect_defs.length ());
4220       FOR_EACH_VEC_ELT (new_phis, i, phi)
4221         {
4222           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4223           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4224           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4225                            PHI_RESULT (phi));
4226           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4227                                                             loop_vinfo));
4228           inner_phis.quick_push (phi);
4229           new_phis[i] = outer_phi;
4230           prev_phi_info = vinfo_for_stmt (outer_phi);
4231           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4232             {
4233               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4234               new_result = copy_ssa_name (PHI_RESULT (phi));
4235               outer_phi = create_phi_node (new_result, exit_bb);
4236               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4237                                PHI_RESULT (phi));
4238               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4239                                                                 loop_vinfo));
4240               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4241               prev_phi_info = vinfo_for_stmt (outer_phi);
4242             }
4243         }
4244     }
4245
4246   exit_gsi = gsi_after_labels (exit_bb);
4247
4248   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4249          (i.e. when reduc_code is not available) and in the final adjustment
4250          code (if needed).  Also get the original scalar reduction variable as
4251          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4252          represents a reduction pattern), the tree-code and scalar-def are
4253          taken from the original stmt that the pattern-stmt (STMT) replaces.
4254          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4255          are taken from STMT.  */
4256
4257   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4258   if (!orig_stmt)
4259     {
4260       /* Regular reduction  */
4261       orig_stmt = stmt;
4262     }
4263   else
4264     {
4265       /* Reduction pattern  */
4266       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4267       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4268       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4269     }
4270
4271   code = gimple_assign_rhs_code (orig_stmt);
4272   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4273      partial results are added and not subtracted.  */
4274   if (code == MINUS_EXPR)
4275     code = PLUS_EXPR;
4276
4277   scalar_dest = gimple_assign_lhs (orig_stmt);
4278   scalar_type = TREE_TYPE (scalar_dest);
4279   scalar_results.create (group_size);
4280   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4281   bitsize = TYPE_SIZE (scalar_type);
4282
4283   /* In case this is a reduction in an inner-loop while vectorizing an outer
4284      loop - we don't need to extract a single scalar result at the end of the
4285      inner-loop (unless it is double reduction, i.e., the use of reduction is
4286      outside the outer-loop).  The final vector of partial results will be used
4287      in the vectorized outer-loop, or reduced to a scalar result at the end of
4288      the outer-loop.  */
4289   if (nested_in_vect_loop && !double_reduc)
4290     goto vect_finalize_reduction;
4291
4292   /* SLP reduction without reduction chain, e.g.,
4293      # a1 = phi <a2, a0>
4294      # b1 = phi <b2, b0>
4295      a2 = operation (a1)
4296      b2 = operation (b1)  */
4297   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4298
4299   /* In case of reduction chain, e.g.,
4300      # a1 = phi <a3, a0>
4301      a2 = operation (a1)
4302      a3 = operation (a2),
4303
4304      we may end up with more than one vector result.  Here we reduce them to
4305      one vector.  */
4306   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4307     {
4308       tree first_vect = PHI_RESULT (new_phis[0]);
4309       tree tmp;
4310       gassign *new_vec_stmt = NULL;
4311
4312       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4313       for (k = 1; k < new_phis.length (); k++)
4314         {
4315           gimple *next_phi = new_phis[k];
4316           tree second_vect = PHI_RESULT (next_phi);
4317
4318           tmp = build2 (code, vectype,  first_vect, second_vect);
4319           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4320           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4321           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4322           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4323         }
4324
4325       new_phi_result = first_vect;
4326       if (new_vec_stmt)
4327         {
4328           new_phis.truncate (0);
4329           new_phis.safe_push (new_vec_stmt);
4330         }
4331     }
4332   else
4333     new_phi_result = PHI_RESULT (new_phis[0]);
4334
4335   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4336     {
4337       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4338          various data values where the condition matched and another vector
4339          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4340          need to extract the last matching index (which will be the index with
4341          highest value) and use this to index into the data vector.
4342          For the case where there were no matches, the data vector will contain
4343          all default values and the index vector will be all zeros.  */
4344
4345       /* Get various versions of the type of the vector of indexes.  */
4346       tree index_vec_type = TREE_TYPE (induction_index);
4347       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4348       tree index_scalar_type = TREE_TYPE (index_vec_type);
4349       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4350         (index_vec_type);
4351
4352       /* Get an unsigned integer version of the type of the data vector.  */
4353       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4354       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4355       tree vectype_unsigned = build_vector_type
4356         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4357
4358       /* First we need to create a vector (ZERO_VEC) of zeros and another
4359          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4360          can create using a MAX reduction and then expanding.
4361          In the case where the loop never made any matches, the max index will
4362          be zero.  */
4363
4364       /* Vector of {0, 0, 0,...}.  */
4365       tree zero_vec = make_ssa_name (vectype);
4366       tree zero_vec_rhs = build_zero_cst (vectype);
4367       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4368       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4369
4370       /* Find maximum value from the vector of found indexes.  */
4371       tree max_index = make_ssa_name (index_scalar_type);
4372       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4373                                                     induction_index);
4374       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4375
4376       /* Vector of {max_index, max_index, max_index,...}.  */
4377       tree max_index_vec = make_ssa_name (index_vec_type);
4378       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4379                                                       max_index);
4380       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4381                                                         max_index_vec_rhs);
4382       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4383
4384       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4385          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4386          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4387          otherwise.  Only one value should match, resulting in a vector
4388          (VEC_COND) with one data value and the rest zeros.
4389          In the case where the loop never made any matches, every index will
4390          match, resulting in a vector with all data values (which will all be
4391          the default value).  */
4392
4393       /* Compare the max index vector to the vector of found indexes to find
4394          the position of the max value.  */
4395       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4396       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4397                                                       induction_index,
4398                                                       max_index_vec);
4399       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4400
4401       /* Use the compare to choose either values from the data vector or
4402          zero.  */
4403       tree vec_cond = make_ssa_name (vectype);
4404       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4405                                                    vec_compare, new_phi_result,
4406                                                    zero_vec);
4407       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4408
4409       /* Finally we need to extract the data value from the vector (VEC_COND)
4410          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4411          reduction, but because this doesn't exist, we can use a MAX reduction
4412          instead.  The data value might be signed or a float so we need to cast
4413          it first.
4414          In the case where the loop never made any matches, the data values are
4415          all identical, and so will reduce down correctly.  */
4416
4417       /* Make the matched data values unsigned.  */
4418       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4419       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4420                                        vec_cond);
4421       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4422                                                         VIEW_CONVERT_EXPR,
4423                                                         vec_cond_cast_rhs);
4424       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4425
4426       /* Reduce down to a scalar value.  */
4427       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4428       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4429                                       optab_default);
4430       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4431                   != CODE_FOR_nothing);
4432       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4433                                                      REDUC_MAX_EXPR,
4434                                                      vec_cond_cast);
4435       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4436
4437       /* Convert the reduced value back to the result type and set as the
4438          result.  */
4439       tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4440                                      data_reduc);
4441       epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4442       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4443       gimple_assign_set_lhs (epilog_stmt, new_temp);
4444       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4445       scalar_results.safe_push (new_temp);
4446     }
4447
4448   /* 2.3 Create the reduction code, using one of the three schemes described
4449          above. In SLP we simply need to extract all the elements from the
4450          vector (without reducing them), so we use scalar shifts.  */
4451   else if (reduc_code != ERROR_MARK && !slp_reduc)
4452     {
4453       tree tmp;
4454       tree vec_elem_type;
4455
4456       /*** Case 1:  Create:
4457            v_out2 = reduc_expr <v_out1>  */
4458
4459       if (dump_enabled_p ())
4460         dump_printf_loc (MSG_NOTE, vect_location,
4461                          "Reduce using direct vector reduction.\n");
4462
4463       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4464       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4465         {
4466           tree tmp_dest =
4467               vect_create_destination_var (scalar_dest, vec_elem_type);
4468           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4469           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4470           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4471           gimple_assign_set_lhs (epilog_stmt, new_temp);
4472           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4473
4474           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4475         }
4476       else
4477         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4478       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4479       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4480       gimple_assign_set_lhs (epilog_stmt, new_temp);
4481       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4482       scalar_results.safe_push (new_temp);
4483     }
4484   else
4485     {
4486       bool reduce_with_shift = have_whole_vector_shift (mode);
4487       int element_bitsize = tree_to_uhwi (bitsize);
4488       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4489       tree vec_temp;
4490
4491       /* Regardless of whether we have a whole vector shift, if we're
4492          emulating the operation via tree-vect-generic, we don't want
4493          to use it.  Only the first round of the reduction is likely
4494          to still be profitable via emulation.  */
4495       /* ??? It might be better to emit a reduction tree code here, so that
4496          tree-vect-generic can expand the first round via bit tricks.  */
4497       if (!VECTOR_MODE_P (mode))
4498         reduce_with_shift = false;
4499       else
4500         {
4501           optab optab = optab_for_tree_code (code, vectype, optab_default);
4502           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4503             reduce_with_shift = false;
4504         }
4505
4506       if (reduce_with_shift && !slp_reduc)
4507         {
4508           int nelements = vec_size_in_bits / element_bitsize;
4509           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4510
4511           int elt_offset;
4512
4513           tree zero_vec = build_zero_cst (vectype);
4514           /*** Case 2: Create:
4515              for (offset = nelements/2; offset >= 1; offset/=2)
4516                 {
4517                   Create:  va' = vec_shift <va, offset>
4518                   Create:  va = vop <va, va'>
4519                 }  */
4520
4521           tree rhs;
4522
4523           if (dump_enabled_p ())
4524             dump_printf_loc (MSG_NOTE, vect_location,
4525                              "Reduce using vector shifts\n");
4526
4527           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4528           new_temp = new_phi_result;
4529           for (elt_offset = nelements / 2;
4530                elt_offset >= 1;
4531                elt_offset /= 2)
4532             {
4533               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4534               tree mask = vect_gen_perm_mask_any (vectype, sel);
4535               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4536                                                  new_temp, zero_vec, mask);
4537               new_name = make_ssa_name (vec_dest, epilog_stmt);
4538               gimple_assign_set_lhs (epilog_stmt, new_name);
4539               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4540
4541               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4542                                                  new_temp);
4543               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4544               gimple_assign_set_lhs (epilog_stmt, new_temp);
4545               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4546             }
4547
4548           /* 2.4  Extract the final scalar result.  Create:
4549              s_out3 = extract_field <v_out2, bitpos>  */
4550
4551           if (dump_enabled_p ())
4552             dump_printf_loc (MSG_NOTE, vect_location,
4553                              "extract scalar result\n");
4554
4555           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4556                         bitsize, bitsize_zero_node);
4557           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4558           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4559           gimple_assign_set_lhs (epilog_stmt, new_temp);
4560           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4561           scalar_results.safe_push (new_temp);
4562         }
4563       else
4564         {
4565           /*** Case 3: Create:
4566              s = extract_field <v_out2, 0>
4567              for (offset = element_size;
4568                   offset < vector_size;
4569                   offset += element_size;)
4570                {
4571                  Create:  s' = extract_field <v_out2, offset>
4572                  Create:  s = op <s, s'>  // For non SLP cases
4573                }  */
4574
4575           if (dump_enabled_p ())
4576             dump_printf_loc (MSG_NOTE, vect_location,
4577                              "Reduce using scalar code.\n");
4578
4579           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4580           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4581             {
4582               int bit_offset;
4583               if (gimple_code (new_phi) == GIMPLE_PHI)
4584                 vec_temp = PHI_RESULT (new_phi);
4585               else
4586                 vec_temp = gimple_assign_lhs (new_phi);
4587               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4588                             bitsize_zero_node);
4589               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4590               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4591               gimple_assign_set_lhs (epilog_stmt, new_temp);
4592               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4593
4594               /* In SLP we don't need to apply reduction operation, so we just
4595                  collect s' values in SCALAR_RESULTS.  */
4596               if (slp_reduc)
4597                 scalar_results.safe_push (new_temp);
4598
4599               for (bit_offset = element_bitsize;
4600                    bit_offset < vec_size_in_bits;
4601                    bit_offset += element_bitsize)
4602                 {
4603                   tree bitpos = bitsize_int (bit_offset);
4604                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4605                                      bitsize, bitpos);
4606
4607                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4608                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4609                   gimple_assign_set_lhs (epilog_stmt, new_name);
4610                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4611
4612                   if (slp_reduc)
4613                     {
4614                       /* In SLP we don't need to apply reduction operation, so
4615                          we just collect s' values in SCALAR_RESULTS.  */
4616                       new_temp = new_name;
4617                       scalar_results.safe_push (new_name);
4618                     }
4619                   else
4620                     {
4621                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4622                                                          new_name, new_temp);
4623                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4624                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4625                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4626                     }
4627                 }
4628             }
4629
4630           /* The only case where we need to reduce scalar results in SLP, is
4631              unrolling.  If the size of SCALAR_RESULTS is greater than
4632              GROUP_SIZE, we reduce them combining elements modulo
4633              GROUP_SIZE.  */
4634           if (slp_reduc)
4635             {
4636               tree res, first_res, new_res;
4637               gimple *new_stmt;
4638
4639               /* Reduce multiple scalar results in case of SLP unrolling.  */
4640               for (j = group_size; scalar_results.iterate (j, &res);
4641                    j++)
4642                 {
4643                   first_res = scalar_results[j % group_size];
4644                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4645                                                   first_res, res);
4646                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4647                   gimple_assign_set_lhs (new_stmt, new_res);
4648                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4649                   scalar_results[j % group_size] = new_res;
4650                 }
4651             }
4652           else
4653             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4654             scalar_results.safe_push (new_temp);
4655         }
4656     }
4657
4658 vect_finalize_reduction:
4659
4660   if (double_reduc)
4661     loop = loop->inner;
4662
4663   /* 2.5 Adjust the final result by the initial value of the reduction
4664          variable. (When such adjustment is not needed, then
4665          'adjustment_def' is zero).  For example, if code is PLUS we create:
4666          new_temp = loop_exit_def + adjustment_def  */
4667
4668   if (adjustment_def)
4669     {
4670       gcc_assert (!slp_reduc);
4671       if (nested_in_vect_loop)
4672         {
4673           new_phi = new_phis[0];
4674           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4675           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4676           new_dest = vect_create_destination_var (scalar_dest, vectype);
4677         }
4678       else
4679         {
4680           new_temp = scalar_results[0];
4681           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4682           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4683           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4684         }
4685
4686       epilog_stmt = gimple_build_assign (new_dest, expr);
4687       new_temp = make_ssa_name (new_dest, epilog_stmt);
4688       gimple_assign_set_lhs (epilog_stmt, new_temp);
4689       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4690       if (nested_in_vect_loop)
4691         {
4692           set_vinfo_for_stmt (epilog_stmt,
4693                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
4694           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4695                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4696
4697           if (!double_reduc)
4698             scalar_results.quick_push (new_temp);
4699           else
4700             scalar_results[0] = new_temp;
4701         }
4702       else
4703         scalar_results[0] = new_temp;
4704
4705       new_phis[0] = epilog_stmt;
4706     }
4707
4708   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4709           phis with new adjusted scalar results, i.e., replace use <s_out0>
4710           with use <s_out4>.
4711
4712      Transform:
4713         loop_exit:
4714           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4715           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4716           v_out2 = reduce <v_out1>
4717           s_out3 = extract_field <v_out2, 0>
4718           s_out4 = adjust_result <s_out3>
4719           use <s_out0>
4720           use <s_out0>
4721
4722      into:
4723
4724         loop_exit:
4725           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4726           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4727           v_out2 = reduce <v_out1>
4728           s_out3 = extract_field <v_out2, 0>
4729           s_out4 = adjust_result <s_out3>
4730           use <s_out4>
4731           use <s_out4> */
4732
4733
4734   /* In SLP reduction chain we reduce vector results into one vector if
4735      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4736      the last stmt in the reduction chain, since we are looking for the loop
4737      exit phi node.  */
4738   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4739     {
4740       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4741       /* Handle reduction patterns.  */
4742       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4743         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4744
4745       scalar_dest = gimple_assign_lhs (dest_stmt);
4746       group_size = 1;
4747     }
4748
4749   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4750      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4751      need to match SCALAR_RESULTS with corresponding statements.  The first
4752      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4753      the first vector stmt, etc.
4754      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4755   if (group_size > new_phis.length ())
4756     {
4757       ratio = group_size / new_phis.length ();
4758       gcc_assert (!(group_size % new_phis.length ()));
4759     }
4760   else
4761     ratio = 1;
4762
4763   for (k = 0; k < group_size; k++)
4764     {
4765       if (k % ratio == 0)
4766         {
4767           epilog_stmt = new_phis[k / ratio];
4768           reduction_phi = reduction_phis[k / ratio];
4769           if (double_reduc)
4770             inner_phi = inner_phis[k / ratio];
4771         }
4772
4773       if (slp_reduc)
4774         {
4775           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4776
4777           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4778           /* SLP statements can't participate in patterns.  */
4779           gcc_assert (!orig_stmt);
4780           scalar_dest = gimple_assign_lhs (current_stmt);
4781         }
4782
4783       phis.create (3);
4784       /* Find the loop-closed-use at the loop exit of the original scalar
4785          result.  (The reduction result is expected to have two immediate uses -
4786          one at the latch block, and one at the loop exit).  */
4787       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4788         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4789             && !is_gimple_debug (USE_STMT (use_p)))
4790           phis.safe_push (USE_STMT (use_p));
4791
4792       /* While we expect to have found an exit_phi because of loop-closed-ssa
4793          form we can end up without one if the scalar cycle is dead.  */
4794
4795       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4796         {
4797           if (outer_loop)
4798             {
4799               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4800               gphi *vect_phi;
4801
4802               /* FORNOW. Currently not supporting the case that an inner-loop
4803                  reduction is not used in the outer-loop (but only outside the
4804                  outer-loop), unless it is double reduction.  */
4805               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4806                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4807                           || double_reduc);
4808
4809               if (double_reduc)
4810                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4811               else
4812                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4813               if (!double_reduc
4814                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4815                       != vect_double_reduction_def)
4816                 continue;
4817
4818               /* Handle double reduction:
4819
4820                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4821                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4822                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4823                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4824
4825                  At that point the regular reduction (stmt2 and stmt3) is
4826                  already vectorized, as well as the exit phi node, stmt4.
4827                  Here we vectorize the phi node of double reduction, stmt1, and
4828                  update all relevant statements.  */
4829
4830               /* Go through all the uses of s2 to find double reduction phi
4831                  node, i.e., stmt1 above.  */
4832               orig_name = PHI_RESULT (exit_phi);
4833               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4834                 {
4835                   stmt_vec_info use_stmt_vinfo;
4836                   stmt_vec_info new_phi_vinfo;
4837                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4838                   basic_block bb = gimple_bb (use_stmt);
4839                   gimple *use;
4840
4841                   /* Check that USE_STMT is really double reduction phi
4842                      node.  */
4843                   if (gimple_code (use_stmt) != GIMPLE_PHI
4844                       || gimple_phi_num_args (use_stmt) != 2
4845                       || bb->loop_father != outer_loop)
4846                     continue;
4847                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4848                   if (!use_stmt_vinfo
4849                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4850                           != vect_double_reduction_def)
4851                     continue;
4852
4853                   /* Create vector phi node for double reduction:
4854                      vs1 = phi <vs0, vs2>
4855                      vs1 was created previously in this function by a call to
4856                        vect_get_vec_def_for_operand and is stored in
4857                        vec_initial_def;
4858                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4859                      vs0 is created here.  */
4860
4861                   /* Create vector phi node.  */
4862                   vect_phi = create_phi_node (vec_initial_def, bb);
4863                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4864                                     loop_vec_info_for_loop (outer_loop));
4865                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4866
4867                   /* Create vs0 - initial def of the double reduction phi.  */
4868                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4869                                              loop_preheader_edge (outer_loop));
4870                   init_def = get_initial_def_for_reduction (stmt,
4871                                                           preheader_arg, NULL);
4872                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4873                                                     vectype, NULL);
4874
4875                   /* Update phi node arguments with vs0 and vs2.  */
4876                   add_phi_arg (vect_phi, vect_phi_init,
4877                                loop_preheader_edge (outer_loop),
4878                                UNKNOWN_LOCATION);
4879                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4880                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4881                   if (dump_enabled_p ())
4882                     {
4883                       dump_printf_loc (MSG_NOTE, vect_location,
4884                                        "created double reduction phi node: ");
4885                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4886                       dump_printf (MSG_NOTE, "\n");
4887                     }
4888
4889                   vect_phi_res = PHI_RESULT (vect_phi);
4890
4891                   /* Replace the use, i.e., set the correct vs1 in the regular
4892                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4893                      loop is redundant.  */
4894                   use = reduction_phi;
4895                   for (j = 0; j < ncopies; j++)
4896                     {
4897                       edge pr_edge = loop_preheader_edge (loop);
4898                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4899                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4900                     }
4901                 }
4902             }
4903         }
4904
4905       phis.release ();
4906       if (nested_in_vect_loop)
4907         {
4908           if (double_reduc)
4909             loop = outer_loop;
4910           else
4911             continue;
4912         }
4913
4914       phis.create (3);
4915       /* Find the loop-closed-use at the loop exit of the original scalar
4916          result.  (The reduction result is expected to have two immediate uses,
4917          one at the latch block, and one at the loop exit).  For double
4918          reductions we are looking for exit phis of the outer loop.  */
4919       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4920         {
4921           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4922             {
4923               if (!is_gimple_debug (USE_STMT (use_p)))
4924                 phis.safe_push (USE_STMT (use_p));
4925             }
4926           else
4927             {
4928               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4929                 {
4930                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4931
4932                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4933                     {
4934                       if (!flow_bb_inside_loop_p (loop,
4935                                              gimple_bb (USE_STMT (phi_use_p)))
4936                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4937                         phis.safe_push (USE_STMT (phi_use_p));
4938                     }
4939                 }
4940             }
4941         }
4942
4943       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4944         {
4945           /* Replace the uses:  */
4946           orig_name = PHI_RESULT (exit_phi);
4947           scalar_result = scalar_results[k];
4948           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4949             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4950               SET_USE (use_p, scalar_result);
4951         }
4952
4953       phis.release ();
4954     }
4955 }
4956
4957
4958 /* Function vectorizable_reduction.
4959
4960    Check if STMT performs a reduction operation that can be vectorized.
4961    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4962    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4963    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4964
4965    This function also handles reduction idioms (patterns) that have been
4966    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4967    of this form:
4968      X = pattern_expr (arg0, arg1, ..., X)
4969    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4970    sequence that had been detected and replaced by the pattern-stmt (STMT).
4971
4972    This function also handles reduction of condition expressions, for example:
4973      for (int i = 0; i < N; i++)
4974        if (a[i] < value)
4975          last = a[i];
4976    This is handled by vectorising the loop and creating an additional vector
4977    containing the loop indexes for which "a[i] < value" was true.  In the
4978    function epilogue this is reduced to a single max value and then used to
4979    index into the vector of results.
4980
4981    In some cases of reduction patterns, the type of the reduction variable X is
4982    different than the type of the other arguments of STMT.
4983    In such cases, the vectype that is used when transforming STMT into a vector
4984    stmt is different than the vectype that is used to determine the
4985    vectorization factor, because it consists of a different number of elements
4986    than the actual number of elements that are being operated upon in parallel.
4987
4988    For example, consider an accumulation of shorts into an int accumulator.
4989    On some targets it's possible to vectorize this pattern operating on 8
4990    shorts at a time (hence, the vectype for purposes of determining the
4991    vectorization factor should be V8HI); on the other hand, the vectype that
4992    is used to create the vector form is actually V4SI (the type of the result).
4993
4994    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4995    indicates what is the actual level of parallelism (V8HI in the example), so
4996    that the right vectorization factor would be derived.  This vectype
4997    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4998    be used to create the vectorized stmt.  The right vectype for the vectorized
4999    stmt is obtained from the type of the result X:
5000         get_vectype_for_scalar_type (TREE_TYPE (X))
5001
5002    This means that, contrary to "regular" reductions (or "regular" stmts in
5003    general), the following equation:
5004       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5005    does *NOT* necessarily hold for reduction patterns.  */
5006
5007 bool
5008 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5009                         gimple **vec_stmt, slp_tree slp_node)
5010 {
5011   tree vec_dest;
5012   tree scalar_dest;
5013   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5014   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5015   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5016   tree vectype_in = NULL_TREE;
5017   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5018   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5019   enum tree_code code, orig_code, epilog_reduc_code;
5020   machine_mode vec_mode;
5021   int op_type;
5022   optab optab, reduc_optab;
5023   tree new_temp = NULL_TREE;
5024   gimple *def_stmt;
5025   enum vect_def_type dt;
5026   gphi *new_phi = NULL;
5027   tree scalar_type;
5028   bool is_simple_use;
5029   gimple *orig_stmt;
5030   stmt_vec_info orig_stmt_info;
5031   tree expr = NULL_TREE;
5032   int i;
5033   int ncopies;
5034   int epilog_copies;
5035   stmt_vec_info prev_stmt_info, prev_phi_info;
5036   bool single_defuse_cycle = false;
5037   tree reduc_def = NULL_TREE;
5038   gimple *new_stmt = NULL;
5039   int j;
5040   tree ops[3];
5041   bool nested_cycle = false, found_nested_cycle_def = false;
5042   gimple *reduc_def_stmt = NULL;
5043   bool double_reduc = false, dummy;
5044   basic_block def_bb;
5045   struct loop * def_stmt_loop, *outer_loop = NULL;
5046   tree def_arg;
5047   gimple *def_arg_stmt;
5048   auto_vec<tree> vec_oprnds0;
5049   auto_vec<tree> vec_oprnds1;
5050   auto_vec<tree> vect_defs;
5051   auto_vec<gimple *> phis;
5052   int vec_num;
5053   tree def0, def1, tem, op0, op1 = NULL_TREE;
5054   bool first_p = true;
5055   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5056
5057   /* In case of reduction chain we switch to the first stmt in the chain, but
5058      we don't update STMT_INFO, since only the last stmt is marked as reduction
5059      and has reduction properties.  */
5060   if (GROUP_FIRST_ELEMENT (stmt_info)
5061       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5062     {
5063       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5064       first_p = false;
5065     }
5066
5067   if (nested_in_vect_loop_p (loop, stmt))
5068     {
5069       outer_loop = loop;
5070       loop = loop->inner;
5071       nested_cycle = true;
5072     }
5073
5074   /* 1. Is vectorizable reduction?  */
5075   /* Not supportable if the reduction variable is used in the loop, unless
5076      it's a reduction chain.  */
5077   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5078       && !GROUP_FIRST_ELEMENT (stmt_info))
5079     return false;
5080
5081   /* Reductions that are not used even in an enclosing outer-loop,
5082      are expected to be "live" (used out of the loop).  */
5083   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5084       && !STMT_VINFO_LIVE_P (stmt_info))
5085     return false;
5086
5087   /* Make sure it was already recognized as a reduction computation.  */
5088   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5089       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5090     return false;
5091
5092   /* 2. Has this been recognized as a reduction pattern?
5093
5094      Check if STMT represents a pattern that has been recognized
5095      in earlier analysis stages.  For stmts that represent a pattern,
5096      the STMT_VINFO_RELATED_STMT field records the last stmt in
5097      the original sequence that constitutes the pattern.  */
5098
5099   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5100   if (orig_stmt)
5101     {
5102       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5103       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5104       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5105     }
5106
5107   /* 3. Check the operands of the operation.  The first operands are defined
5108         inside the loop body. The last operand is the reduction variable,
5109         which is defined by the loop-header-phi.  */
5110
5111   gcc_assert (is_gimple_assign (stmt));
5112
5113   /* Flatten RHS.  */
5114   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5115     {
5116     case GIMPLE_SINGLE_RHS:
5117       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5118       if (op_type == ternary_op)
5119         {
5120           tree rhs = gimple_assign_rhs1 (stmt);
5121           ops[0] = TREE_OPERAND (rhs, 0);
5122           ops[1] = TREE_OPERAND (rhs, 1);
5123           ops[2] = TREE_OPERAND (rhs, 2);
5124           code = TREE_CODE (rhs);
5125         }
5126       else
5127         return false;
5128       break;
5129
5130     case GIMPLE_BINARY_RHS:
5131       code = gimple_assign_rhs_code (stmt);
5132       op_type = TREE_CODE_LENGTH (code);
5133       gcc_assert (op_type == binary_op);
5134       ops[0] = gimple_assign_rhs1 (stmt);
5135       ops[1] = gimple_assign_rhs2 (stmt);
5136       break;
5137
5138     case GIMPLE_TERNARY_RHS:
5139       code = gimple_assign_rhs_code (stmt);
5140       op_type = TREE_CODE_LENGTH (code);
5141       gcc_assert (op_type == ternary_op);
5142       ops[0] = gimple_assign_rhs1 (stmt);
5143       ops[1] = gimple_assign_rhs2 (stmt);
5144       ops[2] = gimple_assign_rhs3 (stmt);
5145       break;
5146
5147     case GIMPLE_UNARY_RHS:
5148       return false;
5149
5150     default:
5151       gcc_unreachable ();
5152     }
5153   /* The default is that the reduction variable is the last in statement.  */
5154   int reduc_index = op_type - 1;
5155
5156   if (code == COND_EXPR && slp_node)
5157     return false;
5158
5159   scalar_dest = gimple_assign_lhs (stmt);
5160   scalar_type = TREE_TYPE (scalar_dest);
5161   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5162       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5163     return false;
5164
5165   /* Do not try to vectorize bit-precision reductions.  */
5166   if ((TYPE_PRECISION (scalar_type)
5167        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5168     return false;
5169
5170   /* All uses but the last are expected to be defined in the loop.
5171      The last use is the reduction variable.  In case of nested cycle this
5172      assumption is not true: we use reduc_index to record the index of the
5173      reduction variable.  */
5174   for (i = 0; i < op_type - 1; i++)
5175     {
5176       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5177       if (i == 0 && code == COND_EXPR)
5178         continue;
5179
5180       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5181                                           &def_stmt, &dt, &tem);
5182       if (!vectype_in)
5183         vectype_in = tem;
5184       gcc_assert (is_simple_use);
5185
5186       if (dt != vect_internal_def
5187           && dt != vect_external_def
5188           && dt != vect_constant_def
5189           && dt != vect_induction_def
5190           && !(dt == vect_nested_cycle && nested_cycle))
5191         return false;
5192
5193       if (dt == vect_nested_cycle)
5194         {
5195           found_nested_cycle_def = true;
5196           reduc_def_stmt = def_stmt;
5197           reduc_index = i;
5198         }
5199     }
5200
5201   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &dt, &tem);
5202   if (!vectype_in)
5203     vectype_in = tem;
5204   gcc_assert (is_simple_use);
5205   if (!found_nested_cycle_def)
5206     reduc_def_stmt = def_stmt;
5207
5208   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5209     return false;
5210
5211   if (!(dt == vect_reduction_def
5212         || dt == vect_nested_cycle
5213         || ((dt == vect_internal_def || dt == vect_external_def
5214              || dt == vect_constant_def || dt == vect_induction_def)
5215             && nested_cycle && found_nested_cycle_def)))
5216     {
5217       /* For pattern recognized stmts, orig_stmt might be a reduction,
5218          but some helper statements for the pattern might not, or
5219          might be COND_EXPRs with reduction uses in the condition.  */
5220       gcc_assert (orig_stmt);
5221       return false;
5222     }
5223
5224   gimple *tmp = vect_is_simple_reduction
5225                   (loop_vinfo, reduc_def_stmt,
5226                   !nested_cycle, &dummy, false,
5227                   &STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info));
5228   if (orig_stmt)
5229     gcc_assert (tmp == orig_stmt
5230                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5231   else
5232     /* We changed STMT to be the first stmt in reduction chain, hence we
5233        check that in this case the first element in the chain is STMT.  */
5234     gcc_assert (stmt == tmp
5235                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5236
5237   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5238     return false;
5239
5240   if (slp_node || PURE_SLP_STMT (stmt_info))
5241     ncopies = 1;
5242   else
5243     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5244                / TYPE_VECTOR_SUBPARTS (vectype_in));
5245
5246   gcc_assert (ncopies >= 1);
5247
5248   vec_mode = TYPE_MODE (vectype_in);
5249
5250   if (code == COND_EXPR)
5251     {
5252       /* Only call during the analysis stage, otherwise we'll lose
5253          STMT_VINFO_TYPE.  */
5254       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5255                                                 ops[reduc_index], 0, NULL))
5256         {
5257           if (dump_enabled_p ())
5258             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5259                              "unsupported condition in reduction\n");
5260           return false;
5261         }
5262     }
5263   else
5264     {
5265       /* 4. Supportable by target?  */
5266
5267       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5268           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5269         {
5270           /* Shifts and rotates are only supported by vectorizable_shifts,
5271              not vectorizable_reduction.  */
5272           if (dump_enabled_p ())
5273             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5274                              "unsupported shift or rotation.\n");
5275           return false;
5276         }
5277
5278       /* 4.1. check support for the operation in the loop  */
5279       optab = optab_for_tree_code (code, vectype_in, optab_default);
5280       if (!optab)
5281         {
5282           if (dump_enabled_p ())
5283             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5284                              "no optab.\n");
5285
5286           return false;
5287         }
5288
5289       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5290         {
5291           if (dump_enabled_p ())
5292             dump_printf (MSG_NOTE, "op not supported by target.\n");
5293
5294           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5295               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5296                   < vect_min_worthwhile_factor (code))
5297             return false;
5298
5299           if (dump_enabled_p ())
5300             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5301         }
5302
5303       /* Worthwhile without SIMD support?  */
5304       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5305           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5306              < vect_min_worthwhile_factor (code))
5307         {
5308           if (dump_enabled_p ())
5309             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5310                              "not worthwhile without SIMD support.\n");
5311
5312           return false;
5313         }
5314     }
5315
5316   /* 4.2. Check support for the epilog operation.
5317
5318           If STMT represents a reduction pattern, then the type of the
5319           reduction variable may be different than the type of the rest
5320           of the arguments.  For example, consider the case of accumulation
5321           of shorts into an int accumulator; The original code:
5322                         S1: int_a = (int) short_a;
5323           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5324
5325           was replaced with:
5326                         STMT: int_acc = widen_sum <short_a, int_acc>
5327
5328           This means that:
5329           1. The tree-code that is used to create the vector operation in the
5330              epilog code (that reduces the partial results) is not the
5331              tree-code of STMT, but is rather the tree-code of the original
5332              stmt from the pattern that STMT is replacing.  I.e, in the example
5333              above we want to use 'widen_sum' in the loop, but 'plus' in the
5334              epilog.
5335           2. The type (mode) we use to check available target support
5336              for the vector operation to be created in the *epilog*, is
5337              determined by the type of the reduction variable (in the example
5338              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5339              However the type (mode) we use to check available target support
5340              for the vector operation to be created *inside the loop*, is
5341              determined by the type of the other arguments to STMT (in the
5342              example we'd check this: optab_handler (widen_sum_optab,
5343              vect_short_mode)).
5344
5345           This is contrary to "regular" reductions, in which the types of all
5346           the arguments are the same as the type of the reduction variable.
5347           For "regular" reductions we can therefore use the same vector type
5348           (and also the same tree-code) when generating the epilog code and
5349           when generating the code inside the loop.  */
5350
5351   if (orig_stmt)
5352     {
5353       /* This is a reduction pattern: get the vectype from the type of the
5354          reduction variable, and get the tree-code from orig_stmt.  */
5355       orig_code = gimple_assign_rhs_code (orig_stmt);
5356       gcc_assert (vectype_out);
5357       vec_mode = TYPE_MODE (vectype_out);
5358     }
5359   else
5360     {
5361       /* Regular reduction: use the same vectype and tree-code as used for
5362          the vector code inside the loop can be used for the epilog code. */
5363       orig_code = code;
5364     }
5365
5366   if (nested_cycle)
5367     {
5368       def_bb = gimple_bb (reduc_def_stmt);
5369       def_stmt_loop = def_bb->loop_father;
5370       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5371                                        loop_preheader_edge (def_stmt_loop));
5372       if (TREE_CODE (def_arg) == SSA_NAME
5373           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5374           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5375           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5376           && vinfo_for_stmt (def_arg_stmt)
5377           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5378               == vect_double_reduction_def)
5379         double_reduc = true;
5380     }
5381
5382   epilog_reduc_code = ERROR_MARK;
5383
5384   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION)
5385     {
5386       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5387         {
5388           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5389                                          optab_default);
5390           if (!reduc_optab)
5391             {
5392               if (dump_enabled_p ())
5393                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5394                                  "no optab for reduction.\n");
5395
5396               epilog_reduc_code = ERROR_MARK;
5397             }
5398           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5399             {
5400               optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5401               if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5402                 {
5403                   if (dump_enabled_p ())
5404                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5405                                      "reduc op not supported by target.\n");
5406
5407                   epilog_reduc_code = ERROR_MARK;
5408                 }
5409             }
5410         }
5411       else
5412         {
5413           if (!nested_cycle || double_reduc)
5414             {
5415               if (dump_enabled_p ())
5416                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5417                                  "no reduc code for scalar code.\n");
5418
5419               return false;
5420             }
5421         }
5422     }
5423   else
5424     {
5425       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5426       cr_index_scalar_type = make_unsigned_type (scalar_precision);
5427       cr_index_vector_type = build_vector_type
5428         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5429
5430       epilog_reduc_code = REDUC_MAX_EXPR;
5431       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5432                                    optab_default);
5433       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5434           == CODE_FOR_nothing)
5435         {
5436           if (dump_enabled_p ())
5437             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5438                              "reduc max op not supported by target.\n");
5439           return false;
5440         }
5441     }
5442
5443   if ((double_reduc
5444        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5445       && ncopies > 1)
5446     {
5447       if (dump_enabled_p ())
5448         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5449                          "multiple types in double reduction or condition "
5450                          "reduction.\n");
5451       return false;
5452     }
5453
5454   /* In case of widenning multiplication by a constant, we update the type
5455      of the constant to be the type of the other operand.  We check that the
5456      constant fits the type in the pattern recognition pass.  */
5457   if (code == DOT_PROD_EXPR
5458       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5459     {
5460       if (TREE_CODE (ops[0]) == INTEGER_CST)
5461         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5462       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5463         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5464       else
5465         {
5466           if (dump_enabled_p ())
5467             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5468                              "invalid types in dot-prod\n");
5469
5470           return false;
5471         }
5472     }
5473
5474   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5475     {
5476       widest_int ni;
5477
5478       if (! max_loop_iterations (loop, &ni))
5479         {
5480           if (dump_enabled_p ())
5481             dump_printf_loc (MSG_NOTE, vect_location,
5482                              "loop count not known, cannot create cond "
5483                              "reduction.\n");
5484           return false;
5485         }
5486       /* Convert backedges to iterations.  */
5487       ni += 1;
5488
5489       /* The additional index will be the same type as the condition.  Check
5490          that the loop can fit into this less one (because we'll use up the
5491          zero slot for when there are no matches).  */
5492       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5493       if (wi::geu_p (ni, wi::to_widest (max_index)))
5494         {
5495           if (dump_enabled_p ())
5496             dump_printf_loc (MSG_NOTE, vect_location,
5497                              "loop size is greater than data size.\n");
5498           return false;
5499         }
5500     }
5501
5502   if (!vec_stmt) /* transformation not required.  */
5503     {
5504       if (first_p
5505           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5506                                          reduc_index))
5507         return false;
5508       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5509       return true;
5510     }
5511
5512   /** Transform.  **/
5513
5514   if (dump_enabled_p ())
5515     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5516
5517   /* FORNOW: Multiple types are not supported for condition.  */
5518   if (code == COND_EXPR)
5519     gcc_assert (ncopies == 1);
5520
5521   /* Create the destination vector  */
5522   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5523
5524   /* In case the vectorization factor (VF) is bigger than the number
5525      of elements that we can fit in a vectype (nunits), we have to generate
5526      more than one vector stmt - i.e - we need to "unroll" the
5527      vector stmt by a factor VF/nunits.  For more details see documentation
5528      in vectorizable_operation.  */
5529
5530   /* If the reduction is used in an outer loop we need to generate
5531      VF intermediate results, like so (e.g. for ncopies=2):
5532         r0 = phi (init, r0)
5533         r1 = phi (init, r1)
5534         r0 = x0 + r0;
5535         r1 = x1 + r1;
5536     (i.e. we generate VF results in 2 registers).
5537     In this case we have a separate def-use cycle for each copy, and therefore
5538     for each copy we get the vector def for the reduction variable from the
5539     respective phi node created for this copy.
5540
5541     Otherwise (the reduction is unused in the loop nest), we can combine
5542     together intermediate results, like so (e.g. for ncopies=2):
5543         r = phi (init, r)
5544         r = x0 + r;
5545         r = x1 + r;
5546    (i.e. we generate VF/2 results in a single register).
5547    In this case for each copy we get the vector def for the reduction variable
5548    from the vectorized reduction operation generated in the previous iteration.
5549   */
5550
5551   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5552     {
5553       single_defuse_cycle = true;
5554       epilog_copies = 1;
5555     }
5556   else
5557     epilog_copies = ncopies;
5558
5559   prev_stmt_info = NULL;
5560   prev_phi_info = NULL;
5561   if (slp_node)
5562     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5563   else
5564     {
5565       vec_num = 1;
5566       vec_oprnds0.create (1);
5567       if (op_type == ternary_op)
5568         vec_oprnds1.create (1);
5569     }
5570
5571   phis.create (vec_num);
5572   vect_defs.create (vec_num);
5573   if (!slp_node)
5574     vect_defs.quick_push (NULL_TREE);
5575
5576   for (j = 0; j < ncopies; j++)
5577     {
5578       if (j == 0 || !single_defuse_cycle)
5579         {
5580           for (i = 0; i < vec_num; i++)
5581             {
5582               /* Create the reduction-phi that defines the reduction
5583                  operand.  */
5584               new_phi = create_phi_node (vec_dest, loop->header);
5585               set_vinfo_for_stmt (new_phi,
5586                                   new_stmt_vec_info (new_phi, loop_vinfo));
5587                if (j == 0 || slp_node)
5588                  phis.quick_push (new_phi);
5589             }
5590         }
5591
5592       if (code == COND_EXPR)
5593         {
5594           gcc_assert (!slp_node);
5595           vectorizable_condition (stmt, gsi, vec_stmt,
5596                                   PHI_RESULT (phis[0]),
5597                                   reduc_index, NULL);
5598           /* Multiple types are not supported for condition.  */
5599           break;
5600         }
5601
5602       /* Handle uses.  */
5603       if (j == 0)
5604         {
5605           op0 = ops[!reduc_index];
5606           if (op_type == ternary_op)
5607             {
5608               if (reduc_index == 0)
5609                 op1 = ops[2];
5610               else
5611                 op1 = ops[1];
5612             }
5613
5614           if (slp_node)
5615             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5616                                slp_node, -1);
5617           else
5618             {
5619               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5620                                                             stmt);
5621               vec_oprnds0.quick_push (loop_vec_def0);
5622               if (op_type == ternary_op)
5623                {
5624                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5625                  vec_oprnds1.quick_push (loop_vec_def1);
5626                }
5627             }
5628         }
5629       else
5630         {
5631           if (!slp_node)
5632             {
5633               enum vect_def_type dt;
5634               gimple *dummy_stmt;
5635
5636               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5637                                   &dummy_stmt, &dt);
5638               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5639                                                               loop_vec_def0);
5640               vec_oprnds0[0] = loop_vec_def0;
5641               if (op_type == ternary_op)
5642                 {
5643                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5644                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5645                                                                 loop_vec_def1);
5646                   vec_oprnds1[0] = loop_vec_def1;
5647                 }
5648             }
5649
5650           if (single_defuse_cycle)
5651             reduc_def = gimple_assign_lhs (new_stmt);
5652
5653           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5654         }
5655
5656       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5657         {
5658           if (slp_node)
5659             reduc_def = PHI_RESULT (phis[i]);
5660           else
5661             {
5662               if (!single_defuse_cycle || j == 0)
5663                 reduc_def = PHI_RESULT (new_phi);
5664             }
5665
5666           def1 = ((op_type == ternary_op)
5667                   ? vec_oprnds1[i] : NULL);
5668           if (op_type == binary_op)
5669             {
5670               if (reduc_index == 0)
5671                 expr = build2 (code, vectype_out, reduc_def, def0);
5672               else
5673                 expr = build2 (code, vectype_out, def0, reduc_def);
5674             }
5675           else
5676             {
5677               if (reduc_index == 0)
5678                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5679               else
5680                 {
5681                   if (reduc_index == 1)
5682                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5683                   else
5684                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5685                 }
5686             }
5687
5688           new_stmt = gimple_build_assign (vec_dest, expr);
5689           new_temp = make_ssa_name (vec_dest, new_stmt);
5690           gimple_assign_set_lhs (new_stmt, new_temp);
5691           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5692
5693           if (slp_node)
5694             {
5695               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5696               vect_defs.quick_push (new_temp);
5697             }
5698           else
5699             vect_defs[0] = new_temp;
5700         }
5701
5702       if (slp_node)
5703         continue;
5704
5705       if (j == 0)
5706         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5707       else
5708         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5709
5710       prev_stmt_info = vinfo_for_stmt (new_stmt);
5711       prev_phi_info = vinfo_for_stmt (new_phi);
5712     }
5713
5714   tree indx_before_incr, indx_after_incr, cond_name = NULL;
5715
5716   /* Finalize the reduction-phi (set its arguments) and create the
5717      epilog reduction code.  */
5718   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5719     {
5720       new_temp = gimple_assign_lhs (*vec_stmt);
5721       vect_defs[0] = new_temp;
5722
5723       /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5724          which is updated with the current index of the loop for every match of
5725          the original loop's cond_expr (VEC_STMT).  This results in a vector
5726          containing the last time the condition passed for that vector lane.
5727          The first match will be a 1 to allow 0 to be used for non-matching
5728          indexes.  If there are no matches at all then the vector will be all
5729          zeroes.  */
5730       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5731         {
5732           int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
5733           int k;
5734
5735           gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
5736
5737           /* First we create a simple vector induction variable which starts
5738              with the values {1,2,3,...} (SERIES_VECT) and increments by the
5739              vector size (STEP).  */
5740
5741           /* Create a {1,2,3,...} vector.  */
5742           tree *vtemp = XALLOCAVEC (tree, nunits_out);
5743           for (k = 0; k < nunits_out; ++k)
5744             vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
5745           tree series_vect = build_vector (cr_index_vector_type, vtemp);
5746
5747           /* Create a vector of the step value.  */
5748           tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5749           tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5750
5751           /* Create an induction variable.  */
5752           gimple_stmt_iterator incr_gsi;
5753           bool insert_after;
5754           standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5755           create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5756                      insert_after, &indx_before_incr, &indx_after_incr);
5757
5758           /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5759              filled with zeros (VEC_ZERO).  */
5760
5761           /* Create a vector of 0s.  */
5762           tree zero = build_zero_cst (cr_index_scalar_type);
5763           tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5764
5765           /* Create a vector phi node.  */
5766           tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5767           new_phi = create_phi_node (new_phi_tree, loop->header);
5768           set_vinfo_for_stmt (new_phi,
5769                               new_stmt_vec_info (new_phi, loop_vinfo));
5770           add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
5771                        UNKNOWN_LOCATION);
5772
5773           /* Now take the condition from the loops original cond_expr
5774              (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
5775              every match uses values from the induction variable
5776              (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5777              (NEW_PHI_TREE).
5778              Finally, we update the phi (NEW_PHI_TREE) to take the value of
5779              the new cond_expr (INDEX_COND_EXPR).  */
5780
5781           /* Turn the condition from vec_stmt into an ssa name.  */
5782           gimple_stmt_iterator vec_stmt_gsi = gsi_for_stmt (*vec_stmt);
5783           tree ccompare = gimple_assign_rhs1 (*vec_stmt);
5784           tree ccompare_name = make_ssa_name (TREE_TYPE (ccompare));
5785           gimple *ccompare_stmt = gimple_build_assign (ccompare_name,
5786                                                        ccompare);
5787           gsi_insert_before (&vec_stmt_gsi, ccompare_stmt, GSI_SAME_STMT);
5788           gimple_assign_set_rhs1 (*vec_stmt, ccompare_name);
5789           update_stmt (*vec_stmt);
5790
5791           /* Create a conditional, where the condition is taken from vec_stmt
5792              (CCOMPARE_NAME), then is the induction index (INDEX_BEFORE_INCR)
5793              and else is the phi (NEW_PHI_TREE).  */
5794           tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5795                                          ccompare_name, indx_before_incr,
5796                                          new_phi_tree);
5797           cond_name = make_ssa_name (cr_index_vector_type);
5798           gimple *index_condition = gimple_build_assign (cond_name,
5799                                                          index_cond_expr);
5800           gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5801           stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5802                                                             loop_vinfo);
5803           STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5804           set_vinfo_for_stmt (index_condition, index_vec_info);
5805
5806           /* Update the phi with the vec cond.  */
5807           add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
5808                        UNKNOWN_LOCATION);
5809         }
5810     }
5811
5812   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5813                                     epilog_reduc_code, phis, reduc_index,
5814                                     double_reduc, slp_node, cond_name);
5815
5816   return true;
5817 }
5818
5819 /* Function vect_min_worthwhile_factor.
5820
5821    For a loop where we could vectorize the operation indicated by CODE,
5822    return the minimum vectorization factor that makes it worthwhile
5823    to use generic vectors.  */
5824 int
5825 vect_min_worthwhile_factor (enum tree_code code)
5826 {
5827   switch (code)
5828     {
5829     case PLUS_EXPR:
5830     case MINUS_EXPR:
5831     case NEGATE_EXPR:
5832       return 4;
5833
5834     case BIT_AND_EXPR:
5835     case BIT_IOR_EXPR:
5836     case BIT_XOR_EXPR:
5837     case BIT_NOT_EXPR:
5838       return 2;
5839
5840     default:
5841       return INT_MAX;
5842     }
5843 }
5844
5845
5846 /* Function vectorizable_induction
5847
5848    Check if PHI performs an induction computation that can be vectorized.
5849    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5850    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5851    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5852
5853 bool
5854 vectorizable_induction (gimple *phi,
5855                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5856                         gimple **vec_stmt)
5857 {
5858   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5859   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5860   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5861   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5862   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5863   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5864   tree vec_def;
5865
5866   gcc_assert (ncopies >= 1);
5867   /* FORNOW. These restrictions should be relaxed.  */
5868   if (nested_in_vect_loop_p (loop, phi))
5869     {
5870       imm_use_iterator imm_iter;
5871       use_operand_p use_p;
5872       gimple *exit_phi;
5873       edge latch_e;
5874       tree loop_arg;
5875
5876       if (ncopies > 1)
5877         {
5878           if (dump_enabled_p ())
5879             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5880                              "multiple types in nested loop.\n");
5881           return false;
5882         }
5883
5884       exit_phi = NULL;
5885       latch_e = loop_latch_edge (loop->inner);
5886       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5887       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5888         {
5889           gimple *use_stmt = USE_STMT (use_p);
5890           if (is_gimple_debug (use_stmt))
5891             continue;
5892
5893           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5894             {
5895               exit_phi = use_stmt;
5896               break;
5897             }
5898         }
5899       if (exit_phi)
5900         {
5901           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5902           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5903                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5904             {
5905               if (dump_enabled_p ())
5906                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5907                                  "inner-loop induction only used outside "
5908                                  "of the outer vectorized loop.\n");
5909               return false;
5910             }
5911         }
5912     }
5913
5914   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5915     return false;
5916
5917   /* FORNOW: SLP not supported.  */
5918   if (STMT_SLP_TYPE (stmt_info))
5919     return false;
5920
5921   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5922
5923   if (gimple_code (phi) != GIMPLE_PHI)
5924     return false;
5925
5926   if (!vec_stmt) /* transformation not required.  */
5927     {
5928       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5929       if (dump_enabled_p ())
5930         dump_printf_loc (MSG_NOTE, vect_location,
5931                          "=== vectorizable_induction ===\n");
5932       vect_model_induction_cost (stmt_info, ncopies);
5933       return true;
5934     }
5935
5936   /** Transform.  **/
5937
5938   if (dump_enabled_p ())
5939     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5940
5941   vec_def = get_initial_def_for_induction (phi);
5942   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5943   return true;
5944 }
5945
5946 /* Function vectorizable_live_operation.
5947
5948    STMT computes a value that is used outside the loop.  Check if
5949    it can be supported.  */
5950
5951 bool
5952 vectorizable_live_operation (gimple *stmt,
5953                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5954                              gimple **vec_stmt)
5955 {
5956   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5957   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5958   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5959   tree op;
5960   gimple *def_stmt;
5961   ssa_op_iter iter;
5962
5963   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5964
5965   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5966     return false;
5967
5968   if (!is_gimple_assign (stmt))
5969     {
5970       if (gimple_call_internal_p (stmt)
5971           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5972           && gimple_call_lhs (stmt)
5973           && loop->simduid
5974           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5975           && loop->simduid
5976              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5977         {
5978           edge e = single_exit (loop);
5979           basic_block merge_bb = e->dest;
5980           imm_use_iterator imm_iter;
5981           use_operand_p use_p;
5982           tree lhs = gimple_call_lhs (stmt);
5983
5984           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5985             {
5986               gimple *use_stmt = USE_STMT (use_p);
5987               if (gimple_code (use_stmt) == GIMPLE_PHI
5988                   && gimple_bb (use_stmt) == merge_bb)
5989                 {
5990                   if (vec_stmt)
5991                     {
5992                       tree vfm1
5993                         = build_int_cst (unsigned_type_node,
5994                                          loop_vinfo->vectorization_factor - 1);
5995                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5996                     }
5997                   return true;
5998                 }
5999             }
6000         }
6001
6002       return false;
6003     }
6004
6005   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6006     return false;
6007
6008   /* FORNOW. CHECKME. */
6009   if (nested_in_vect_loop_p (loop, stmt))
6010     return false;
6011
6012   /* FORNOW: support only if all uses are invariant.  This means
6013      that the scalar operations can remain in place, unvectorized.
6014      The original last scalar value that they compute will be used.  */
6015   FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
6016     {
6017       enum vect_def_type dt = vect_uninitialized_def;
6018
6019       if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
6020         {
6021           if (dump_enabled_p ())
6022             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6023                              "use not simple.\n");
6024           return false;
6025         }
6026
6027       if (dt != vect_external_def && dt != vect_constant_def)
6028         return false;
6029     }
6030
6031   /* No transformation is required for the cases we currently support.  */
6032   return true;
6033 }
6034
6035 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
6036
6037 static void
6038 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6039 {
6040   ssa_op_iter op_iter;
6041   imm_use_iterator imm_iter;
6042   def_operand_p def_p;
6043   gimple *ustmt;
6044
6045   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6046     {
6047       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6048         {
6049           basic_block bb;
6050
6051           if (!is_gimple_debug (ustmt))
6052             continue;
6053
6054           bb = gimple_bb (ustmt);
6055
6056           if (!flow_bb_inside_loop_p (loop, bb))
6057             {
6058               if (gimple_debug_bind_p (ustmt))
6059                 {
6060                   if (dump_enabled_p ())
6061                     dump_printf_loc (MSG_NOTE, vect_location,
6062                                      "killing debug use\n");
6063
6064                   gimple_debug_bind_reset_value (ustmt);
6065                   update_stmt (ustmt);
6066                 }
6067               else
6068                 gcc_unreachable ();
6069             }
6070         }
6071     }
6072 }
6073
6074
6075 /* This function builds ni_name = number of iterations.  Statements
6076    are emitted on the loop preheader edge.  */
6077
6078 static tree
6079 vect_build_loop_niters (loop_vec_info loop_vinfo)
6080 {
6081   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6082   if (TREE_CODE (ni) == INTEGER_CST)
6083     return ni;
6084   else
6085     {
6086       tree ni_name, var;
6087       gimple_seq stmts = NULL;
6088       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6089
6090       var = create_tmp_var (TREE_TYPE (ni), "niters");
6091       ni_name = force_gimple_operand (ni, &stmts, false, var);
6092       if (stmts)
6093         gsi_insert_seq_on_edge_immediate (pe, stmts);
6094
6095       return ni_name;
6096     }
6097 }
6098
6099
6100 /* This function generates the following statements:
6101
6102    ni_name = number of iterations loop executes
6103    ratio = ni_name / vf
6104    ratio_mult_vf_name = ratio * vf
6105
6106    and places them on the loop preheader edge.  */
6107
6108 static void
6109 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6110                                  tree ni_name,
6111                                  tree *ratio_mult_vf_name_ptr,
6112                                  tree *ratio_name_ptr)
6113 {
6114   tree ni_minus_gap_name;
6115   tree var;
6116   tree ratio_name;
6117   tree ratio_mult_vf_name;
6118   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6119   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6120   tree log_vf;
6121
6122   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6123
6124   /* If epilogue loop is required because of data accesses with gaps, we
6125      subtract one iteration from the total number of iterations here for
6126      correct calculation of RATIO.  */
6127   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6128     {
6129       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6130                                        ni_name,
6131                                        build_one_cst (TREE_TYPE (ni_name)));
6132       if (!is_gimple_val (ni_minus_gap_name))
6133         {
6134           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6135           gimple *stmts = NULL;
6136           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6137                                                     true, var);
6138           gsi_insert_seq_on_edge_immediate (pe, stmts);
6139         }
6140     }
6141   else
6142     ni_minus_gap_name = ni_name;
6143
6144   /* Create: ratio = ni >> log2(vf) */
6145   /* ???  As we have ni == number of latch executions + 1, ni could
6146      have overflown to zero.  So avoid computing ratio based on ni
6147      but compute it using the fact that we know ratio will be at least
6148      one, thus via (ni - vf) >> log2(vf) + 1.  */
6149   ratio_name
6150     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6151                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6152                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6153                                              ni_minus_gap_name,
6154                                              build_int_cst
6155                                                (TREE_TYPE (ni_name), vf)),
6156                                 log_vf),
6157                    build_int_cst (TREE_TYPE (ni_name), 1));
6158   if (!is_gimple_val (ratio_name))
6159     {
6160       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6161       gimple *stmts = NULL;
6162       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6163       gsi_insert_seq_on_edge_immediate (pe, stmts);
6164     }
6165   *ratio_name_ptr = ratio_name;
6166
6167   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6168
6169   if (ratio_mult_vf_name_ptr)
6170     {
6171       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6172                                         ratio_name, log_vf);
6173       if (!is_gimple_val (ratio_mult_vf_name))
6174         {
6175           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6176           gimple *stmts = NULL;
6177           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6178                                                      true, var);
6179           gsi_insert_seq_on_edge_immediate (pe, stmts);
6180         }
6181       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6182     }
6183
6184   return;
6185 }
6186
6187
6188 /* Function vect_transform_loop.
6189
6190    The analysis phase has determined that the loop is vectorizable.
6191    Vectorize the loop - created vectorized stmts to replace the scalar
6192    stmts in the loop, and update the loop exit condition.  */
6193
6194 void
6195 vect_transform_loop (loop_vec_info loop_vinfo)
6196 {
6197   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6198   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6199   int nbbs = loop->num_nodes;
6200   int i;
6201   tree ratio = NULL;
6202   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6203   bool grouped_store;
6204   bool slp_scheduled = false;
6205   gimple *stmt, *pattern_stmt;
6206   gimple_seq pattern_def_seq = NULL;
6207   gimple_stmt_iterator pattern_def_si = gsi_none ();
6208   bool transform_pattern_stmt = false;
6209   bool check_profitability = false;
6210   int th;
6211   /* Record number of iterations before we started tampering with the profile. */
6212   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6213
6214   if (dump_enabled_p ())
6215     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6216
6217   /* If profile is inprecise, we have chance to fix it up.  */
6218   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6219     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6220
6221   /* Use the more conservative vectorization threshold.  If the number
6222      of iterations is constant assume the cost check has been performed
6223      by our caller.  If the threshold makes all loops profitable that
6224      run at least the vectorization factor number of times checking
6225      is pointless, too.  */
6226   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6227   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6228       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6229     {
6230       if (dump_enabled_p ())
6231         dump_printf_loc (MSG_NOTE, vect_location,
6232                          "Profitability threshold is %d loop iterations.\n",
6233                          th);
6234       check_profitability = true;
6235     }
6236
6237   /* Version the loop first, if required, so the profitability check
6238      comes first.  */
6239
6240   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
6241       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
6242     {
6243       vect_loop_versioning (loop_vinfo, th, check_profitability);
6244       check_profitability = false;
6245     }
6246
6247   tree ni_name = vect_build_loop_niters (loop_vinfo);
6248   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6249
6250   /* Peel the loop if there are data refs with unknown alignment.
6251      Only one data ref with unknown store is allowed.  */
6252
6253   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6254     {
6255       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6256                                      th, check_profitability);
6257       check_profitability = false;
6258       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6259          be re-computed.  */
6260       ni_name = NULL_TREE;
6261     }
6262
6263   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6264      compile time constant), or it is a constant that doesn't divide by the
6265      vectorization factor, then an epilog loop needs to be created.
6266      We therefore duplicate the loop: the original loop will be vectorized,
6267      and will compute the first (n/VF) iterations.  The second copy of the loop
6268      will remain scalar and will compute the remaining (n%VF) iterations.
6269      (VF is the vectorization factor).  */
6270
6271   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6272       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6273     {
6274       tree ratio_mult_vf;
6275       if (!ni_name)
6276         ni_name = vect_build_loop_niters (loop_vinfo);
6277       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6278                                        &ratio);
6279       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6280                                       th, check_profitability);
6281     }
6282   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6283     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6284                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6285   else
6286     {
6287       if (!ni_name)
6288         ni_name = vect_build_loop_niters (loop_vinfo);
6289       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6290     }
6291
6292   /* 1) Make sure the loop header has exactly two entries
6293      2) Make sure we have a preheader basic block.  */
6294
6295   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6296
6297   split_edge (loop_preheader_edge (loop));
6298
6299   /* FORNOW: the vectorizer supports only loops which body consist
6300      of one basic block (header + empty latch). When the vectorizer will
6301      support more involved loop forms, the order by which the BBs are
6302      traversed need to be reconsidered.  */
6303
6304   for (i = 0; i < nbbs; i++)
6305     {
6306       basic_block bb = bbs[i];
6307       stmt_vec_info stmt_info;
6308
6309       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6310            gsi_next (&si))
6311         {
6312           gphi *phi = si.phi ();
6313           if (dump_enabled_p ())
6314             {
6315               dump_printf_loc (MSG_NOTE, vect_location,
6316                                "------>vectorizing phi: ");
6317               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6318               dump_printf (MSG_NOTE, "\n");
6319             }
6320           stmt_info = vinfo_for_stmt (phi);
6321           if (!stmt_info)
6322             continue;
6323
6324           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6325             vect_loop_kill_debug_uses (loop, phi);
6326
6327           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6328               && !STMT_VINFO_LIVE_P (stmt_info))
6329             continue;
6330
6331           if (STMT_VINFO_VECTYPE (stmt_info)
6332               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6333                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6334               && dump_enabled_p ())
6335             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6336
6337           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6338             {
6339               if (dump_enabled_p ())
6340                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6341               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6342             }
6343         }
6344
6345       pattern_stmt = NULL;
6346       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6347            !gsi_end_p (si) || transform_pattern_stmt;)
6348         {
6349           bool is_store;
6350
6351           if (transform_pattern_stmt)
6352             stmt = pattern_stmt;
6353           else
6354             {
6355               stmt = gsi_stmt (si);
6356               /* During vectorization remove existing clobber stmts.  */
6357               if (gimple_clobber_p (stmt))
6358                 {
6359                   unlink_stmt_vdef (stmt);
6360                   gsi_remove (&si, true);
6361                   release_defs (stmt);
6362                   continue;
6363                 }
6364             }
6365
6366           if (dump_enabled_p ())
6367             {
6368               dump_printf_loc (MSG_NOTE, vect_location,
6369                                "------>vectorizing statement: ");
6370               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6371               dump_printf (MSG_NOTE, "\n");
6372             }
6373
6374           stmt_info = vinfo_for_stmt (stmt);
6375
6376           /* vector stmts created in the outer-loop during vectorization of
6377              stmts in an inner-loop may not have a stmt_info, and do not
6378              need to be vectorized.  */
6379           if (!stmt_info)
6380             {
6381               gsi_next (&si);
6382               continue;
6383             }
6384
6385           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6386             vect_loop_kill_debug_uses (loop, stmt);
6387
6388           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6389               && !STMT_VINFO_LIVE_P (stmt_info))
6390             {
6391               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6392                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6393                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6394                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6395                 {
6396                   stmt = pattern_stmt;
6397                   stmt_info = vinfo_for_stmt (stmt);
6398                 }
6399               else
6400                 {
6401                   gsi_next (&si);
6402                   continue;
6403                 }
6404             }
6405           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6406                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6407                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6408                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6409             transform_pattern_stmt = true;
6410
6411           /* If pattern statement has def stmts, vectorize them too.  */
6412           if (is_pattern_stmt_p (stmt_info))
6413             {
6414               if (pattern_def_seq == NULL)
6415                 {
6416                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6417                   pattern_def_si = gsi_start (pattern_def_seq);
6418                 }
6419               else if (!gsi_end_p (pattern_def_si))
6420                 gsi_next (&pattern_def_si);
6421               if (pattern_def_seq != NULL)
6422                 {
6423                   gimple *pattern_def_stmt = NULL;
6424                   stmt_vec_info pattern_def_stmt_info = NULL;
6425
6426                   while (!gsi_end_p (pattern_def_si))
6427                     {
6428                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6429                       pattern_def_stmt_info
6430                         = vinfo_for_stmt (pattern_def_stmt);
6431                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6432                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6433                         break;
6434                       gsi_next (&pattern_def_si);
6435                     }
6436
6437                   if (!gsi_end_p (pattern_def_si))
6438                     {
6439                       if (dump_enabled_p ())
6440                         {
6441                           dump_printf_loc (MSG_NOTE, vect_location,
6442                                            "==> vectorizing pattern def "
6443                                            "stmt: ");
6444                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6445                                             pattern_def_stmt, 0);
6446                           dump_printf (MSG_NOTE, "\n");
6447                         }
6448
6449                       stmt = pattern_def_stmt;
6450                       stmt_info = pattern_def_stmt_info;
6451                     }
6452                   else
6453                     {
6454                       pattern_def_si = gsi_none ();
6455                       transform_pattern_stmt = false;
6456                     }
6457                 }
6458               else
6459                 transform_pattern_stmt = false;
6460             }
6461
6462           if (STMT_VINFO_VECTYPE (stmt_info))
6463             {
6464               unsigned int nunits
6465                 = (unsigned int)
6466                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6467               if (!STMT_SLP_TYPE (stmt_info)
6468                   && nunits != (unsigned int) vectorization_factor
6469                   && dump_enabled_p ())
6470                   /* For SLP VF is set according to unrolling factor, and not
6471                      to vector size, hence for SLP this print is not valid.  */
6472                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6473             }
6474
6475           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6476              reached.  */
6477           if (STMT_SLP_TYPE (stmt_info))
6478             {
6479               if (!slp_scheduled)
6480                 {
6481                   slp_scheduled = true;
6482
6483                   if (dump_enabled_p ())
6484                     dump_printf_loc (MSG_NOTE, vect_location,
6485                                      "=== scheduling SLP instances ===\n");
6486
6487                   vect_schedule_slp (loop_vinfo);
6488                 }
6489
6490               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6491               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6492                 {
6493                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6494                     {
6495                       pattern_def_seq = NULL;
6496                       gsi_next (&si);
6497                     }
6498                   continue;
6499                 }
6500             }
6501
6502           /* -------- vectorize statement ------------ */
6503           if (dump_enabled_p ())
6504             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6505
6506           grouped_store = false;
6507           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6508           if (is_store)
6509             {
6510               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6511                 {
6512                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6513                      interleaving chain was completed - free all the stores in
6514                      the chain.  */
6515                   gsi_next (&si);
6516                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6517                 }
6518               else
6519                 {
6520                   /* Free the attached stmt_vec_info and remove the stmt.  */
6521                   gimple *store = gsi_stmt (si);
6522                   free_stmt_vec_info (store);
6523                   unlink_stmt_vdef (store);
6524                   gsi_remove (&si, true);
6525                   release_defs (store);
6526                 }
6527
6528               /* Stores can only appear at the end of pattern statements.  */
6529               gcc_assert (!transform_pattern_stmt);
6530               pattern_def_seq = NULL;
6531             }
6532           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6533             {
6534               pattern_def_seq = NULL;
6535               gsi_next (&si);
6536             }
6537         }                       /* stmts in BB */
6538     }                           /* BBs in loop */
6539
6540   slpeel_make_loop_iterate_ntimes (loop, ratio);
6541
6542   /* Reduce loop iterations by the vectorization factor.  */
6543   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6544                       expected_iterations / vectorization_factor);
6545   loop->nb_iterations_upper_bound
6546     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6547   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6548       && loop->nb_iterations_upper_bound != 0)
6549     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6550   if (loop->any_estimate)
6551     {
6552       loop->nb_iterations_estimate
6553         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6554        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6555            && loop->nb_iterations_estimate != 0)
6556          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6557     }
6558
6559   if (dump_enabled_p ())
6560     {
6561       dump_printf_loc (MSG_NOTE, vect_location,
6562                        "LOOP VECTORIZED\n");
6563       if (loop->inner)
6564         dump_printf_loc (MSG_NOTE, vect_location,
6565                          "OUTER LOOP VECTORIZED\n");
6566       dump_printf (MSG_NOTE, "\n");
6567     }
6568 }