/* Loop Vectorization
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *);
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static opt_result
vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
						   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
{
  vec_info *vinfo = stmt_info->vinfo;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
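/* Illustrative sketch, not part of the vectorizer itself; the function and
   variable names below are made up for exposition.  Assuming 32-bit ints and
   16-byte vector registers (so VF = 4), the strip-mined loop above plus a
   scalar epilogue for the N % VF leftover iterations could be written by
   hand roughly as follows (alignment concerns ignored):

	typedef int v4si __attribute__ ((vector_size (16)));

	void
	add_arrays (int *a, int *b, int *c, int n)
	{
	  int i;
	  for (i = 0; i + 4 <= n; i += 4)
	    *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];
	  for (; i < n; i++)
	    a[i] = b[i] + c[i];
	}

   The first loop is the vector body (4 elements per iteration); the second
   is the scalar epilogue.  The vectorizer arranges the equivalent of this
   epilogue handling itself - see the PEELING_FOR_NITER logic later in this
   file.  */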
static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
			     tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
		     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
	x_1 = PHI <x_4(outer2), ...>;
	...

      inner:
	x_2 = PHI <x_1(outer1), ...>;
	...
	x_3 = ...;
	...

      outer2:
	x_4 = PHI <x_3(inner)>;
	...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */
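/* Illustrative sketch for exposition only (the function name is made up):
   at the source level, a double reduction typically comes from a scalar
   accumulated across a whole loop nest, e.g.

	int
	sum_all (int n, int m, int a[n][m])
	{
	  int sum = 0;
	  for (int i = 0; i < n; i++)
	    for (int j = 0; j < m; j++)
	      sum += a[i][j];
	  return sum;
	}

   Here x_1/x_4 correspond to the values of "sum" carried around the outer
   loop and x_2/x_3 to its values inside the inner loop.  */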
static bool
vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  use_operand_p use_p;
  ssa_op_iter op_iter;
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
	return true;
  return false;
}
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      if (!access_fn
	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      && TREE_CODE (step) != INTEGER_CST))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain);
      if (reduc_stmt_info)
	{
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info)
		= vect_double_reduction_def;
	    }
	  else
	    {
	      if (loop != LOOP_VINFO_LOOP (loop_vinfo))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected vectorizable nested cycle.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected reduction.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
		  /* Store the reduction cycles for possible vectorization in
		     loop-aware SLP if it was not detected as reduction
		     chain.  */
		  if (! reduc_chain)
		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		      (reduc_stmt_info);
		}
	    }
	}
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.

   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
	scalar loop, so we can't change the order of computation when
	vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
	current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}
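/* Illustrative sketch for exposition only (the function name is made up):
   when the outer loop below is the one being vectorized, the accumulation
   of "s" still runs sequentially within each outer iteration, so its PHI is
   classified as a nested cycle rather than as an ordinary reduction:

	void
	row_sums (int n, int m, int a[n][m], int *b)
	{
	  for (int i = 0; i < n; i++)
	    {
	      int s = 0;
	      for (int j = 0; j < m; j++)
		s += a[i][j];
	      b[i] = s;
	    }
	}

   Contrast this with the double-reduction example earlier, where the
   accumulator is not reset and its value is also carried around the outer
   loop.  */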
632 /* Transfer group and reduction information from STMT_INFO to its
633 pattern stmt. */
635 static void
636 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
638 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
639 stmt_vec_info stmtp;
640 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
641 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
642 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
645 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
646 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
647 == STMT_VINFO_DEF_TYPE (stmt_info));
648 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
649 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
650 if (stmt_info)
651 REDUC_GROUP_NEXT_ELEMENT (stmtp)
652 = STMT_VINFO_RELATED_STMT (stmt_info);
654 while (stmt_info);
657 /* Fixup scalar cycles that now have their stmts detected as patterns. */
659 static void
660 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
662 stmt_vec_info first;
663 unsigned i;
665 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
666 if (STMT_VINFO_IN_PATTERN_P (first))
668 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
669 while (next)
671 if (! STMT_VINFO_IN_PATTERN_P (next)
672 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
673 break;
674 next = REDUC_GROUP_NEXT_ELEMENT (next);
676 /* If not all stmt in the chain are patterns or if we failed
677 to update STMT_VINFO_REDUC_IDX try to handle the chain
678 without patterns. */
679 if (! next
680 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
682 vect_fixup_reduc_chain (first);
683 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
684 = STMT_VINFO_RELATED_STMT (first);
689 /* Function vect_get_loop_niters.
691 Determine how many iterations the loop is executed and place it
692 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
693 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
694 niter information holds in ASSUMPTIONS.
696 Return the loop exit condition. */
699 static gcond *
700 vect_get_loop_niters (class loop *loop, tree *assumptions,
701 tree *number_of_iterations, tree *number_of_iterationsm1)
703 edge exit = single_exit (loop);
704 class tree_niter_desc niter_desc;
705 tree niter_assumptions, niter, may_be_zero;
706 gcond *cond = get_loop_exit_condition (loop);
708 *assumptions = boolean_true_node;
709 *number_of_iterationsm1 = chrec_dont_know;
710 *number_of_iterations = chrec_dont_know;
711 DUMP_VECT_SCOPE ("get_loop_niters");
713 if (!exit)
714 return cond;
716 may_be_zero = NULL_TREE;
717 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
718 || chrec_contains_undetermined (niter_desc.niter))
719 return cond;
721 niter_assumptions = niter_desc.assumptions;
722 may_be_zero = niter_desc.may_be_zero;
723 niter = niter_desc.niter;
725 if (may_be_zero && integer_zerop (may_be_zero))
726 may_be_zero = NULL_TREE;
728 if (may_be_zero)
730 if (COMPARISON_CLASS_P (may_be_zero))
732 /* Try to combine may_be_zero with assumptions, this can simplify
733 computation of niter expression. */
734 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
735 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
736 niter_assumptions,
737 fold_build1 (TRUTH_NOT_EXPR,
738 boolean_type_node,
739 may_be_zero));
740 else
741 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
742 build_int_cst (TREE_TYPE (niter), 0),
743 rewrite_to_non_trapping_overflow (niter));
745 may_be_zero = NULL_TREE;
747 else if (integer_nonzerop (may_be_zero))
749 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
750 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
751 return cond;
753 else
754 return cond;
757 *assumptions = niter_assumptions;
758 *number_of_iterationsm1 = niter;
760 /* We want the number of loop header executions which is the number
761 of latch executions plus one.
762 ??? For UINT_MAX latch executions this number overflows to zero
763 for loops like do { n++; } while (n != 0); */
764 if (niter && !chrec_contains_undetermined (niter))
765 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
766 build_int_cst (TREE_TYPE (niter), 1));
767 *number_of_iterations = niter;
769 return cond;
772 /* Function bb_in_loop_p
774 Used as predicate for dfs order traversal of the loop bbs. */
776 static bool
777 bb_in_loop_p (const_basic_block bb, const void *data)
779 const class loop *const loop = (const class loop *)data;
780 if (flow_bb_inside_loop_p (loop, bb))
781 return true;
782 return false;
786 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
787 stmt_vec_info structs for all the stmts in LOOP_IN. */
789 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
790 : vec_info (vec_info::loop, init_cost (loop_in), shared),
791 loop (loop_in),
792 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
793 num_itersm1 (NULL_TREE),
794 num_iters (NULL_TREE),
795 num_iters_unchanged (NULL_TREE),
796 num_iters_assumptions (NULL_TREE),
797 th (0),
798 versioning_threshold (0),
799 vectorization_factor (0),
800 max_vectorization_factor (0),
801 mask_skip_niters (NULL_TREE),
802 mask_compare_type (NULL_TREE),
803 simd_if_cond (NULL_TREE),
804 unaligned_dr (NULL),
805 peeling_for_alignment (0),
806 ptr_mask (0),
807 ivexpr_map (NULL),
808 scan_map (NULL),
809 slp_unrolling_factor (1),
810 single_scalar_iteration_cost (0),
811 vec_outside_cost (0),
812 vec_inside_cost (0),
813 vectorizable (false),
814 can_fully_mask_p (true),
815 fully_masked_p (false),
816 peeling_for_gaps (false),
817 peeling_for_niter (false),
818 no_data_dependencies (false),
819 has_mask_store (false),
820 scalar_loop_scaling (profile_probability::uninitialized ()),
821 scalar_loop (NULL),
822 orig_loop_info (NULL)
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */
829 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
830 bbs, loop->num_nodes, loop);
831 gcc_assert (nbbs == loop->num_nodes);
833 for (unsigned int i = 0; i < nbbs; i++)
835 basic_block bb = bbs[i];
836 gimple_stmt_iterator si;
838 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
840 gimple *phi = gsi_stmt (si);
841 gimple_set_uid (phi, 0);
842 add_stmt (phi);
845 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
847 gimple *stmt = gsi_stmt (si);
848 gimple_set_uid (stmt, 0);
849 add_stmt (stmt);
	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
	     the third argument is the #pragma omp simd if (x) condition: when
	     it is 0, the loop shouldn't be vectorized; when it is a non-zero
	     constant, it should be vectorized normally; otherwise the loop is
	     versioned, with the vectorized copy used if the condition is
	     non-zero at runtime.  (See the illustrative example after this
	     constructor.)  */
855 if (loop_in->simduid
856 && is_gimple_call (stmt)
857 && gimple_call_internal_p (stmt)
858 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
859 && gimple_call_num_args (stmt) >= 3
860 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
861 && (loop_in->simduid
862 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
864 tree arg = gimple_call_arg (stmt, 2);
865 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
866 simd_if_cond = arg;
867 else
868 gcc_assert (integer_nonzerop (arg));
873 epilogue_vinfos.create (6);
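/* Illustrative sketch for exposition only (function and variable names are
   made up): the simd_if_cond handling above corresponds to user code such as

	void
	scale (int *a, int n, int use_simd)
	{
	#pragma omp simd if (use_simd)
	  for (int i = 0; i < n; i++)
	    a[i] *= 2;
	}

   compiled with -fopenmp or -fopenmp-simd.  "use_simd" is the condition that
   reaches the vectorizer as the third argument of the IFN_GOMP_SIMD_LANE
   call, and the loop is versioned so that the vectorized copy is used only
   when the value is non-zero at runtime.  */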
876 /* Free all levels of MASKS. */
878 void
879 release_vec_loop_masks (vec_loop_masks *masks)
881 rgroup_masks *rgm;
882 unsigned int i;
883 FOR_EACH_VEC_ELT (*masks, i, rgm)
884 rgm->masks.release ();
885 masks->release ();
888 /* Free all memory used by the _loop_vec_info, as well as all the
889 stmt_vec_info structs of all the stmts in the loop. */
891 _loop_vec_info::~_loop_vec_info ()
893 free (bbs);
895 release_vec_loop_masks (&masks);
896 delete ivexpr_map;
897 delete scan_map;
898 epilogue_vinfos.release ();
900 loop->aux = NULL;
903 /* Return an invariant or register for EXPR and emit necessary
904 computations in the LOOP_VINFO loop preheader. */
906 tree
907 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
909 if (is_gimple_reg (expr)
910 || is_gimple_min_invariant (expr))
911 return expr;
913 if (! loop_vinfo->ivexpr_map)
914 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
915 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
916 if (! cached)
918 gimple_seq stmts = NULL;
919 cached = force_gimple_operand (unshare_expr (expr),
920 &stmts, true, NULL_TREE);
921 if (stmts)
923 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
924 gsi_insert_seq_on_edge_immediate (e, stmts);
927 return cached;
930 /* Return true if we can use CMP_TYPE as the comparison type to produce
931 all masks required to mask LOOP_VINFO. */
933 static bool
934 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
936 rgroup_masks *rgm;
937 unsigned int i;
938 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
939 if (rgm->mask_type != NULL_TREE
940 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
941 cmp_type, rgm->mask_type,
942 OPTIMIZE_FOR_SPEED))
943 return false;
944 return true;
947 /* Calculate the maximum number of scalars per iteration for every
948 rgroup in LOOP_VINFO. */
950 static unsigned int
951 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
953 unsigned int res = 1;
954 unsigned int i;
955 rgroup_masks *rgm;
956 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
957 res = MAX (res, rgm->max_nscalars_per_iter);
958 return res;
961 /* Each statement in LOOP_VINFO can be masked where necessary. Check
962 whether we can actually generate the masks required. Return true if so,
963 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
965 static bool
966 vect_verify_full_masking (loop_vec_info loop_vinfo)
968 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
969 unsigned int min_ni_width;
970 unsigned int max_nscalars_per_iter
971 = vect_get_max_nscalars_per_iter (loop_vinfo);
973 /* Use a normal loop if there are no statements that need masking.
974 This only happens in rare degenerate cases: it means that the loop
975 has no loads, no stores, and no live-out values. */
976 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
977 return false;
979 /* Get the maximum number of iterations that is representable
980 in the counter type. */
981 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
982 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
984 /* Get a more refined estimate for the number of iterations. */
985 widest_int max_back_edges;
986 if (max_loop_iterations (loop, &max_back_edges))
987 max_ni = wi::smin (max_ni, max_back_edges + 1);
989 /* Account for rgroup masks, in which each bit is replicated N times. */
990 max_ni *= max_nscalars_per_iter;
992 /* Work out how many bits we need to represent the limit. */
993 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
995 /* Find a scalar mode for which WHILE_ULT is supported. */
996 opt_scalar_int_mode cmp_mode_iter;
997 tree cmp_type = NULL_TREE;
998 tree iv_type = NULL_TREE;
999 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1000 unsigned int iv_precision = UINT_MAX;
1002 if (iv_limit != -1)
1003 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1004 UNSIGNED);
1006 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1008 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1009 if (cmp_bits >= min_ni_width
1010 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1012 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1013 if (this_type
1014 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1016 /* Although we could stop as soon as we find a valid mode,
1017 there are at least two reasons why that's not always the
1018 best choice:
1020 - An IV that's Pmode or wider is more likely to be reusable
1021 in address calculations than an IV that's narrower than
1022 Pmode.
1024 - Doing the comparison in IV_PRECISION or wider allows
1025 a natural 0-based IV, whereas using a narrower comparison
1026 type requires mitigations against wrap-around.
1028 Conversely, if the IV limit is variable, doing the comparison
1029 in a wider type than the original type can introduce
1030 unnecessary extensions, so picking the widest valid mode
1031 is not always a good choice either.
1033 Here we prefer the first IV type that's Pmode or wider,
1034 and the first comparison type that's IV_PRECISION or wider.
1035 (The comparison type must be no wider than the IV type,
1036 to avoid extensions in the vector loop.)
1038 ??? We might want to try continuing beyond Pmode for ILP32
1039 targets if CMP_BITS < IV_PRECISION. */
1040 iv_type = this_type;
1041 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1042 cmp_type = this_type;
1043 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1044 break;
1049 if (!cmp_type)
1050 return false;
1052 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1053 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1054 return true;
1057 /* Calculate the cost of one scalar iteration of the loop. */
1058 static void
1059 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1063 int nbbs = loop->num_nodes, factor;
1064 int innerloop_iters, i;
1066 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1068 /* Gather costs for statements in the scalar loop. */
1070 /* FORNOW. */
1071 innerloop_iters = 1;
1072 if (loop->inner)
1073 innerloop_iters = 50; /* FIXME */
1075 for (i = 0; i < nbbs; i++)
1077 gimple_stmt_iterator si;
1078 basic_block bb = bbs[i];
1080 if (bb->loop_father == loop->inner)
1081 factor = innerloop_iters;
1082 else
1083 factor = 1;
1085 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1087 gimple *stmt = gsi_stmt (si);
1088 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1090 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1091 continue;
1093 /* Skip stmts that are not vectorized inside the loop. */
1094 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1095 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1096 && (!STMT_VINFO_LIVE_P (vstmt_info)
1097 || !VECTORIZABLE_CYCLE_DEF
1098 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1099 continue;
1101 vect_cost_for_stmt kind;
1102 if (STMT_VINFO_DATA_REF (stmt_info))
1104 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1105 kind = scalar_load;
1106 else
1107 kind = scalar_store;
1109 else if (vect_nop_conversion_p (stmt_info))
1110 continue;
1111 else
1112 kind = scalar_stmt;
1114 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1115 factor, kind, stmt_info, 0, vect_prologue);
1119 /* Now accumulate cost. */
1120 void *target_cost_data = init_cost (loop);
1121 stmt_info_for_cost *si;
1122 int j;
1123 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1124 j, si)
1125 (void) add_stmt_cost (target_cost_data, si->count,
1126 si->kind, si->stmt_info, si->misalign,
1127 vect_body);
1128 unsigned dummy, body_cost = 0;
1129 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1130 destroy_cost_data (target_cost_data);
1131 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., it is a countable loop.
     The niter may be analyzed under some assumptions.  */
1144 opt_result
1145 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1146 tree *assumptions, tree *number_of_iterationsm1,
1147 tree *number_of_iterations, gcond **inner_loop_cond)
1149 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1151 /* Different restrictions apply when we are considering an inner-most loop,
1152 vs. an outer (nested) loop.
1153 (FORNOW. May want to relax some of these restrictions in the future). */
1155 if (!loop->inner)
1157 /* Inner-most loop. We currently require that the number of BBs is
1158 exactly 2 (the header and latch). Vectorizable inner-most loops
1159 look like this:
1161 (pre-header)
1163 header <--------+
1164 | | |
1165 | +--> latch --+
1167 (exit-bb) */
1169 if (loop->num_nodes != 2)
1170 return opt_result::failure_at (vect_location,
1171 "not vectorized:"
1172 " control flow in loop.\n");
1174 if (empty_block_p (loop->header))
1175 return opt_result::failure_at (vect_location,
1176 "not vectorized: empty loop.\n");
1178 else
1180 class loop *innerloop = loop->inner;
1181 edge entryedge;
1183 /* Nested loop. We currently require that the loop is doubly-nested,
1184 contains a single inner loop, and the number of BBs is exactly 5.
1185 Vectorizable outer-loops look like this:
1187 (pre-header)
1189 header <---+
1191 inner-loop |
1193 tail ------+
1195 (exit-bb)
1197 The inner-loop has the properties expected of inner-most loops
1198 as described above. */
1200 if ((loop->inner)->inner || (loop->inner)->next)
1201 return opt_result::failure_at (vect_location,
1202 "not vectorized:"
1203 " multiple nested loops.\n");
1205 if (loop->num_nodes != 5)
1206 return opt_result::failure_at (vect_location,
1207 "not vectorized:"
1208 " control flow in loop.\n");
1210 entryedge = loop_preheader_edge (innerloop);
1211 if (entryedge->src != loop->header
1212 || !single_exit (innerloop)
1213 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1214 return opt_result::failure_at (vect_location,
1215 "not vectorized:"
1216 " unsupported outerloop form.\n");
1218 /* Analyze the inner-loop. */
1219 tree inner_niterm1, inner_niter, inner_assumptions;
1220 opt_result res
1221 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1222 &inner_assumptions, &inner_niterm1,
1223 &inner_niter, NULL);
1224 if (!res)
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "not vectorized: Bad inner loop.\n");
1229 return res;
1232 /* Don't support analyzing niter under assumptions for inner
1233 loop. */
1234 if (!integer_onep (inner_assumptions))
1235 return opt_result::failure_at (vect_location,
1236 "not vectorized: Bad inner loop.\n");
1238 if (!expr_invariant_in_loop_p (loop, inner_niter))
1239 return opt_result::failure_at (vect_location,
1240 "not vectorized: inner-loop count not"
1241 " invariant.\n");
1243 if (dump_enabled_p ())
1244 dump_printf_loc (MSG_NOTE, vect_location,
1245 "Considering outer-loop vectorization.\n");
1248 if (!single_exit (loop))
1249 return opt_result::failure_at (vect_location,
1250 "not vectorized: multiple exits.\n");
1251 if (EDGE_COUNT (loop->header->preds) != 2)
1252 return opt_result::failure_at (vect_location,
1253 "not vectorized:"
1254 " too many incoming edges.\n");
1256 /* We assume that the loop exit condition is at the end of the loop. i.e,
1257 that the loop is represented as a do-while (with a proper if-guard
1258 before the loop if needed), where the loop header contains all the
1259 executable statements, and the latch is empty. */
1260 if (!empty_block_p (loop->latch)
1261 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1262 return opt_result::failure_at (vect_location,
1263 "not vectorized: latch block not empty.\n");
1265 /* Make sure the exit is not abnormal. */
1266 edge e = single_exit (loop);
1267 if (e->flags & EDGE_ABNORMAL)
1268 return opt_result::failure_at (vect_location,
1269 "not vectorized:"
1270 " abnormal loop exit edge.\n");
1272 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1273 number_of_iterationsm1);
1274 if (!*loop_cond)
1275 return opt_result::failure_at
1276 (vect_location,
1277 "not vectorized: complicated exit condition.\n");
1279 if (integer_zerop (*assumptions)
1280 || !*number_of_iterations
1281 || chrec_contains_undetermined (*number_of_iterations))
1282 return opt_result::failure_at
1283 (*loop_cond,
1284 "not vectorized: number of iterations cannot be computed.\n");
1286 if (integer_zerop (*number_of_iterations))
1287 return opt_result::failure_at
1288 (*loop_cond,
1289 "not vectorized: number of iterations = 0.\n");
1291 return opt_result::success ();
1294 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1296 opt_loop_vec_info
1297 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1299 tree assumptions, number_of_iterations, number_of_iterationsm1;
1300 gcond *loop_cond, *inner_loop_cond = NULL;
1302 opt_result res
1303 = vect_analyze_loop_form_1 (loop, &loop_cond,
1304 &assumptions, &number_of_iterationsm1,
1305 &number_of_iterations, &inner_loop_cond);
1306 if (!res)
1307 return opt_loop_vec_info::propagate_failure (res);
1309 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1310 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1311 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1312 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1313 if (!integer_onep (assumptions))
1315 /* We consider to vectorize this loop by versioning it under
1316 some assumptions. In order to do this, we need to clear
1317 existing information computed by scev and niter analyzer. */
1318 scev_reset_htab ();
1319 free_numbers_of_iterations_estimates (loop);
1320 /* Also set flag for this loop so that following scev and niter
1321 analysis are done under the assumptions. */
1322 loop_constraint_set (loop, LOOP_C_FINITE);
1323 /* Also record the assumptions for versioning. */
1324 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1327 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1329 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_NOTE, vect_location,
1332 "Symbolic number of iterations is ");
1333 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1334 dump_printf (MSG_NOTE, "\n");
1338 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1339 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1340 if (inner_loop_cond)
1342 stmt_vec_info inner_loop_cond_info
1343 = loop_vinfo->lookup_stmt (inner_loop_cond);
1344 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 gcc_assert (!loop->aux);
1348 loop->aux = loop_vinfo;
1349 return opt_loop_vec_info::success (loop_vinfo);
1354 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1355 statements update the vectorization factor. */
1357 static void
1358 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1360 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1361 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1362 int nbbs = loop->num_nodes;
1363 poly_uint64 vectorization_factor;
1364 int i;
1366 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1368 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1369 gcc_assert (known_ne (vectorization_factor, 0U));
  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     the vectorization factor of the loop is the unrolling factor required
     by the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
1376 bool only_slp_in_loop = true;
1377 for (i = 0; i < nbbs; i++)
1379 basic_block bb = bbs[i];
1380 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1381 gsi_next (&si))
1383 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1384 if (!stmt_info)
1385 continue;
1386 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1387 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1388 && !PURE_SLP_STMT (stmt_info))
1389 /* STMT needs both SLP and loop-based vectorization. */
1390 only_slp_in_loop = false;
1392 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1393 gsi_next (&si))
1395 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1396 stmt_info = vect_stmt_to_vectorize (stmt_info);
1397 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399 && !PURE_SLP_STMT (stmt_info))
1400 /* STMT needs both SLP and loop-based vectorization. */
1401 only_slp_in_loop = false;
1405 if (only_slp_in_loop)
1407 if (dump_enabled_p ())
1408 dump_printf_loc (MSG_NOTE, vect_location,
1409 "Loop contains only SLP stmts\n");
1410 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1412 else
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Loop contains SLP and non-SLP stmts\n");
1417 /* Both the vectorization factor and unroll factor have the form
1418 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1419 so they must have a common multiple. */
1420 vectorization_factor
1421 = force_common_multiple (vectorization_factor,
1422 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1425 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1426 if (dump_enabled_p ())
1428 dump_printf_loc (MSG_NOTE, vect_location,
1429 "Updating vectorization factor to ");
1430 dump_dec (MSG_NOTE, vectorization_factor);
1431 dump_printf (MSG_NOTE, ".\n");
1435 /* Return true if STMT_INFO describes a double reduction phi and if
1436 the other phi in the reduction is also relevant for vectorization.
1437 This rejects cases such as:
1439 outer1:
1440 x_1 = PHI <x_3(outer2), ...>;
1443 inner:
1444 x_2 = ...;
1447 outer2:
1448 x_3 = PHI <x_2(inner)>;
1450 if nothing in x_2 or elsewhere makes x_1 relevant. */
1452 static bool
1453 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1455 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1456 return false;
1458 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1461 /* Function vect_analyze_loop_operations.
1463 Scan the loop stmts and make sure they are all vectorizable. */
1465 static opt_result
1466 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1468 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1469 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1470 int nbbs = loop->num_nodes;
1471 int i;
1472 stmt_vec_info stmt_info;
1473 bool need_to_vectorize = false;
1474 bool ok;
1476 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1478 auto_vec<stmt_info_for_cost> cost_vec;
1480 for (i = 0; i < nbbs; i++)
1482 basic_block bb = bbs[i];
1484 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1485 gsi_next (&si))
1487 gphi *phi = si.phi ();
1488 ok = true;
1490 stmt_info = loop_vinfo->lookup_stmt (phi);
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1493 if (virtual_operand_p (gimple_phi_result (phi)))
1494 continue;
1496 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1497 (i.e., a phi in the tail of the outer-loop). */
1498 if (! is_loop_header_bb_p (bb))
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), because this case
		 would require us to actually do something here.  */
1504 if (STMT_VINFO_LIVE_P (stmt_info)
1505 && !vect_active_double_reduction_p (stmt_info))
1506 return opt_result::failure_at (phi,
1507 "Unsupported loop-closed phi"
1508 " in outer-loop.\n");
1510 /* If PHI is used in the outer loop, we check that its operand
1511 is defined in the inner loop. */
1512 if (STMT_VINFO_RELEVANT_P (stmt_info))
1514 tree phi_op;
1516 if (gimple_phi_num_args (phi) != 1)
1517 return opt_result::failure_at (phi, "unsupported phi");
1519 phi_op = PHI_ARG_DEF (phi, 0);
1520 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1521 if (!op_def_info)
1522 return opt_result::failure_at (phi, "unsupported phi\n");
1524 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1525 && (STMT_VINFO_RELEVANT (op_def_info)
1526 != vect_used_in_outer_by_reduction))
1527 return opt_result::failure_at (phi, "unsupported phi\n");
1529 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1530 || (STMT_VINFO_DEF_TYPE (stmt_info)
1531 == vect_double_reduction_def))
1532 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1533 return opt_result::failure_at (phi, "unsupported phi\n");
1536 continue;
1539 gcc_assert (stmt_info);
1541 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1542 || STMT_VINFO_LIVE_P (stmt_info))
1543 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1544 /* A scalar-dependence cycle that we don't support. */
1545 return opt_result::failure_at (phi,
1546 "not vectorized:"
1547 " scalar dependence cycle.\n");
1549 if (STMT_VINFO_RELEVANT_P (stmt_info))
1551 need_to_vectorize = true;
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1553 && ! PURE_SLP_STMT (stmt_info))
1554 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1555 &cost_vec);
1556 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1557 || (STMT_VINFO_DEF_TYPE (stmt_info)
1558 == vect_double_reduction_def)
1559 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1560 && ! PURE_SLP_STMT (stmt_info))
1561 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1564 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1565 if (ok
1566 && STMT_VINFO_LIVE_P (stmt_info)
1567 && !PURE_SLP_STMT (stmt_info))
1568 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1569 -1, false, &cost_vec);
1571 if (!ok)
1572 return opt_result::failure_at (phi,
1573 "not vectorized: relevant phi not "
1574 "supported: %G",
1575 static_cast <gimple *> (phi));
1578 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579 gsi_next (&si))
1581 gimple *stmt = gsi_stmt (si);
1582 if (!gimple_clobber_p (stmt))
1584 opt_result res
1585 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1586 &need_to_vectorize,
1587 NULL, NULL, &cost_vec);
1588 if (!res)
1589 return res;
1592 } /* bbs */
1594 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1596 /* All operations in the loop are either irrelevant (deal with loop
1597 control, or dead), or only used outside the loop and can be moved
1598 out of the loop (e.g. invariants, inductions). The loop can be
1599 optimized away by scalar optimizations. We're better off not
1600 touching this loop. */
1601 if (!need_to_vectorize)
1603 if (dump_enabled_p ())
1604 dump_printf_loc (MSG_NOTE, vect_location,
1605 "All the computation can be taken out of the loop.\n");
1606 return opt_result::failure_at
1607 (vect_location,
1608 "not vectorized: redundant loop. no profit to vectorize.\n");
1611 return opt_result::success ();
1614 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1615 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1616 definitely no, or -1 if it's worth retrying. */
1618 static int
1619 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1622 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1624 /* Only fully-masked loops can have iteration counts less than the
1625 vectorization factor. */
1626 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1628 HOST_WIDE_INT max_niter;
1630 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1631 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1632 else
1633 max_niter = max_stmt_executions_int (loop);
1635 if (max_niter != -1
1636 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640 "not vectorized: iteration count smaller than "
1641 "vectorization factor.\n");
1642 return 0;
1646 int min_profitable_iters, min_profitable_estimate;
1647 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1648 &min_profitable_estimate);
1650 if (min_profitable_iters < 0)
1652 if (dump_enabled_p ())
1653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654 "not vectorized: vectorization not profitable.\n");
1655 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657 "not vectorized: vector version will never be "
1658 "profitable.\n");
1659 return -1;
1662 int min_scalar_loop_bound = (param_min_vect_loop_bound
1663 * assumed_vf);
1665 /* Use the cost model only if it is more conservative than user specified
1666 threshold. */
1667 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1668 min_profitable_iters);
1670 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1672 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1673 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "not vectorized: vectorization not profitable.\n");
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_NOTE, vect_location,
1680 "not vectorized: iteration count smaller than user "
1681 "specified loop bound parameter or minimum profitable "
1682 "iterations (whichever is more conservative).\n");
1683 return 0;
  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
1692 if (min_profitable_estimate > min_profitable_iters
1693 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1694 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1695 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1696 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1698 if (dump_enabled_p ())
1699 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1700 " choice between the scalar and vector loops\n");
1701 min_profitable_estimate = min_profitable_iters;
1704 HOST_WIDE_INT estimated_niter;
1706 /* If we are vectorizing an epilogue then we know the maximum number of
1707 scalar iterations it will cover is at least one lower than the
1708 vectorization factor of the main loop. */
1709 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1710 estimated_niter
1711 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1712 else
1714 estimated_niter = estimated_stmt_executions_int (loop);
1715 if (estimated_niter == -1)
1716 estimated_niter = likely_max_stmt_executions_int (loop);
1718 if (estimated_niter != -1
1719 && ((unsigned HOST_WIDE_INT) estimated_niter
1720 < MAX (th, (unsigned) min_profitable_estimate)))
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: estimated iteration count too "
1725 "small.\n");
1726 if (dump_enabled_p ())
1727 dump_printf_loc (MSG_NOTE, vect_location,
1728 "not vectorized: estimated iteration count smaller "
1729 "than specified loop bound parameter or minimum "
1730 "profitable iterations (whichever is more "
1731 "conservative).\n");
1732 return -1;
1735 return 1;
1738 static opt_result
1739 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1740 vec<data_reference_p> *datarefs,
1741 unsigned int *n_stmts)
1743 *n_stmts = 0;
1744 for (unsigned i = 0; i < loop->num_nodes; i++)
1745 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1746 !gsi_end_p (gsi); gsi_next (&gsi))
1748 gimple *stmt = gsi_stmt (gsi);
1749 if (is_gimple_debug (stmt))
1750 continue;
1751 ++(*n_stmts);
1752 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1753 if (!res)
1755 if (is_gimple_call (stmt) && loop->safelen)
1757 tree fndecl = gimple_call_fndecl (stmt), op;
1758 if (fndecl != NULL_TREE)
1760 cgraph_node *node = cgraph_node::get (fndecl);
1761 if (node != NULL && node->simd_clones != NULL)
1763 unsigned int j, n = gimple_call_num_args (stmt);
1764 for (j = 0; j < n; j++)
1766 op = gimple_call_arg (stmt, j);
1767 if (DECL_P (op)
1768 || (REFERENCE_CLASS_P (op)
1769 && get_base_address (op)))
1770 break;
1772 op = gimple_call_lhs (stmt);
1773 /* Ignore #pragma omp declare simd functions
1774 if they don't have data references in the
1775 call stmt itself. */
1776 if (j == n
1777 && !(op
1778 && (DECL_P (op)
1779 || (REFERENCE_CLASS_P (op)
1780 && get_base_address (op)))))
1781 continue;
1785 return res;
1787 /* If dependence analysis will give up due to the limit on the
1788 number of datarefs, stop here and fail fatally. */
1789 if (datarefs->length ()
1790 > (unsigned)param_loop_max_datarefs_for_datadeps)
1791 return opt_result::failure_at (stmt, "exceeded param "
1792 "loop-max-datarefs-for-datadeps\n");
1794 return opt_result::success ();
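/* A sketch of the kind of call the safelen/simd-clone exception above
   is meant for (the names below are hypothetical):

     #pragma omp declare simd
     extern int f (int);

     void g (int *a, int n)
     {
     #pragma omp simd
       for (int i = 0; i < n; i++)
	 a[i] = f (a[i]);
     }

   The call to f has no DECL or memory-reference operands in the call
   statement itself (the load and store of a[i] are separate stmts),
   so failing to find a data reference for the call is not fatal here.  */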
1797 /* Look for SLP-only access groups and turn each individual access into its own
1798 group. */
1799 static void
1800 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1802 unsigned int i;
1803 struct data_reference *dr;
1805 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1807 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1808 FOR_EACH_VEC_ELT (datarefs, i, dr)
1810 gcc_assert (DR_REF (dr));
1811 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1813 /* Check if the load is a part of an interleaving chain. */
1814 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1816 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1817 unsigned int group_size = DR_GROUP_SIZE (first_element);
1819 /* Check if this is an SLP-only group. */
1820 if (!STMT_SLP_TYPE (stmt_info)
1821 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1823 /* Dissolve the group. */
1824 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1826 stmt_vec_info vinfo = first_element;
1827 while (vinfo)
1829 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1830 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1831 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1832 DR_GROUP_SIZE (vinfo) = 1;
1833 if (STMT_VINFO_STRIDED_P (first_element))
1834 DR_GROUP_GAP (vinfo) = 0;
1835 else
1836 DR_GROUP_GAP (vinfo) = group_size - 1;
1837 vinfo = next;
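/* For example: if a load group of size 4 was created only for the sake
   of SLP (STMT_VINFO_SLP_VECT_ONLY) but its stmts did not end up being
   SLPed, the loop above splits it into four single-element groups, each
   its own DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1 and, for
   non-strided accesses, DR_GROUP_GAP 3 so that the stride of the
   original interleaved access is still accounted for.  */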
1845 /* Decides whether we need to create an epilogue loop to handle
1846 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1848 void
1849 determine_peel_for_niter (loop_vec_info loop_vinfo)
1851 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1853 unsigned HOST_WIDE_INT const_vf;
1854 HOST_WIDE_INT max_niter
1855 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1857 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1858 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1859 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1860 (loop_vinfo));
1862 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1863 /* The main loop handles all iterations. */
1864 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1865 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1866 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1868 /* Work out the (constant) number of iterations that need to be
1869 peeled for reasons other than niters. */
1870 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1871 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1872 peel_niter += 1;
1873 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1874 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1875 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1877 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1878 /* ??? When peeling for gaps but not alignment, we could
1879 try to check whether the (variable) niters is known to be
1880 VF * N + 1. That's something of a niche case though. */
1881 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1882 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1883 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1884 < (unsigned) exact_log2 (const_vf))
1885 /* In case of versioning, check if the maximum number of
1886 iterations is greater than th. If they are identical,
1887 the epilogue is unnecessary. */
1888 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1889 || ((unsigned HOST_WIDE_INT) max_niter
1890 > (th / const_vf) * const_vf))))
1891 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
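/* A worked example of the constant-niters case above, with illustrative
   numbers: for LOOP_VINFO_INT_NITERS == 100, no peeling for alignment,
   peeling for gaps (so peel_niter == 1) and a vectorization factor of 8,
   100 - 1 == 99 is not a multiple of 8, so PEELING_FOR_NITER is set and
   an epilogue loop will handle the leftover scalar iterations.  */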
1895 /* Function vect_analyze_loop_2.
1897 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1898 for it. The different analyses will record information in the
1899 loop_vec_info struct. */
1900 static opt_result
1901 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1903 opt_result ok = opt_result::success ();
1904 int res;
1905 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1906 poly_uint64 min_vf = 2;
1907 loop_vec_info orig_loop_vinfo = NULL;
1909 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1910 loop_vec_info of the first vectorized loop. */
1911 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1912 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1913 else
1914 orig_loop_vinfo = loop_vinfo;
1915 gcc_assert (orig_loop_vinfo);
1917 /* The first group of checks is independent of the vector size. */
1918 fatal = true;
1920 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1921 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1922 return opt_result::failure_at (vect_location,
1923 "not vectorized: simd if(0)\n");
1925 /* Find all data references in the loop (which correspond to vdefs/vuses)
1926 and analyze their evolution in the loop. */
1928 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1930 /* Gather the data references and count stmts in the loop. */
1931 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1933 opt_result res
1934 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1935 &LOOP_VINFO_DATAREFS (loop_vinfo),
1936 n_stmts);
1937 if (!res)
1939 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941 "not vectorized: loop contains function "
1942 "calls or data references that cannot "
1943 "be analyzed\n");
1944 return res;
1946 loop_vinfo->shared->save_datarefs ();
1948 else
1949 loop_vinfo->shared->check_datarefs ();
1951 /* Analyze the data references and also adjust the minimal
1952 vectorization factor according to the loads and stores. */
1954 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1955 if (!ok)
1957 if (dump_enabled_p ())
1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959 "bad data references.\n");
1960 return ok;
1963 /* Classify all cross-iteration scalar data-flow cycles.
1964 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1965 vect_analyze_scalar_cycles (loop_vinfo);
1967 vect_pattern_recog (loop_vinfo);
1969 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1971 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1972 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1974 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1975 if (!ok)
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1979 "bad data access.\n");
1980 return ok;
1983 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1985 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1986 if (!ok)
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990 "unexpected pattern.\n");
1991 return ok;
1994 /* From here on failures are not fatal, as the rest of the analysis below depends on the vector size in some way. */
1995 fatal = false;
1997 /* Analyze data dependences between the data-refs in the loop
1998 and adjust the maximum vectorization factor according to
1999 the dependences.
2000 FORNOW: fail at the first data dependence that we encounter. */
2002 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2003 if (!ok)
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2007 "bad data dependence.\n");
2008 return ok;
2010 if (max_vf != MAX_VECTORIZATION_FACTOR
2011 && maybe_lt (max_vf, min_vf))
2012 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2015 ok = vect_determine_vectorization_factor (loop_vinfo);
2016 if (!ok)
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "can't determine vectorization factor.\n");
2021 return ok;
2023 if (max_vf != MAX_VECTORIZATION_FACTOR
2024 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2025 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2027 /* Compute the scalar iteration cost. */
2028 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2030 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2032 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2033 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2034 if (!ok)
2035 return ok;
2037 /* If there are any SLP instances mark them as pure_slp. */
2038 bool slp = vect_make_slp_decision (loop_vinfo);
2039 if (slp)
2041 /* Find stmts that need to be both vectorized and SLPed. */
2042 vect_detect_hybrid_slp (loop_vinfo);
2044 /* Update the vectorization factor based on the SLP decision. */
2045 vect_update_vf_for_slp (loop_vinfo);
2048 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2050 /* We don't expect to have to roll back to anything other than an empty
2051 set of rgroups. */
2052 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2054 /* This is the point where we can re-start analysis with SLP forced off. */
2055 start_over:
2057 /* Now the vectorization factor is final. */
2058 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059 gcc_assert (known_ne (vectorization_factor, 0U));
2061 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "vectorization_factor = ");
2065 dump_dec (MSG_NOTE, vectorization_factor);
2066 dump_printf (MSG_NOTE, ", niters = %wd\n",
2067 LOOP_VINFO_INT_NITERS (loop_vinfo));
2070 /* Analyze the alignment of the data-refs in the loop.
2071 Fail if a data reference is found that cannot be vectorized. */
2073 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2074 if (!ok)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "bad data alignment.\n");
2079 return ok;
2082 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2083 It is important to call pruning after vect_analyze_data_ref_accesses,
2084 since we use grouping information gathered by interleaving analysis. */
2085 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2086 if (!ok)
2087 return ok;
2089 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2090 vectorization, since we do not want to add extra peeling or
2091 add versioning for alignment. */
2092 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2093 /* This pass will decide on using loop versioning and/or loop peeling in
2094 order to enhance the alignment of data references in the loop. */
2095 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2096 else
2097 ok = vect_verify_datarefs_alignment (loop_vinfo);
2098 if (!ok)
2099 return ok;
2101 if (slp)
2103 /* Analyze operations in the SLP instances. Note this may
2104 remove unsupported SLP instances which makes the above
2105 SLP kind detection invalid. */
2106 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2107 vect_slp_analyze_operations (loop_vinfo);
2108 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2110 ok = opt_result::failure_at (vect_location,
2111 "unsupported SLP instances\n");
2112 goto again;
2116 /* Dissolve SLP-only groups. */
2117 vect_dissolve_slp_only_groups (loop_vinfo);
2119 /* Scan all the remaining operations in the loop that are not subject
2120 to SLP and make sure they are vectorizable. */
2121 ok = vect_analyze_loop_operations (loop_vinfo);
2122 if (!ok)
2124 if (dump_enabled_p ())
2125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2126 "bad operation or unsupported loop bound.\n");
2127 return ok;
2130 /* Decide whether to use a fully-masked loop for this vectorization
2131 factor. */
2132 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2133 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2134 && vect_verify_full_masking (loop_vinfo));
2135 if (dump_enabled_p ())
2137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2138 dump_printf_loc (MSG_NOTE, vect_location,
2139 "using a fully-masked loop.\n");
2140 else
2141 dump_printf_loc (MSG_NOTE, vect_location,
2142 "not using a fully-masked loop.\n");
2145 /* If epilog loop is required because of data accesses with gaps,
2146 one additional iteration needs to be peeled. Check if there is
2147 enough iterations for vectorization. */
2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2149 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2150 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2152 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2153 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2155 if (known_lt (wi::to_widest (scalar_niters), vf))
2156 return opt_result::failure_at (vect_location,
2157 "loop has no enough iterations to"
2158 " support peeling for gaps.\n");
2161 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2162 loop or a loop that has a lower VF than the main loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2164 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2165 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2166 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2167 return opt_result::failure_at (vect_location,
2168 "Vectorization factor too high for"
2169 " epilogue loop.\n");
2171 /* Check the costings of the loop make vectorizing worthwhile. */
2172 res = vect_analyze_loop_costing (loop_vinfo);
2173 if (res < 0)
2175 ok = opt_result::failure_at (vect_location,
2176 "Loop costings may not be worthwhile.\n");
2177 goto again;
2179 if (!res)
2180 return opt_result::failure_at (vect_location,
2181 "Loop costings not worthwhile.\n");
2183 determine_peel_for_niter (loop_vinfo);
2184 /* If an epilogue loop is required make sure we can create one. */
2185 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2186 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2190 if (!vect_can_advance_ivs_p (loop_vinfo)
2191 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2192 single_exit (LOOP_VINFO_LOOP
2193 (loop_vinfo))))
2195 ok = opt_result::failure_at (vect_location,
2196 "not vectorized: can't create required "
2197 "epilog loop\n");
2198 goto again;
2202 /* During peeling, we need to check if the number of loop iterations is
2203 enough for both the peeled prolog loop and the vector loop. This check
2204 can be merged with the threshold check of loop versioning, so
2205 increase the threshold for this case if necessary.
2207 If we are analyzing an epilogue we still want to check what its
2208 versioning threshold would be. If we decide to vectorize the epilogues we
2209 will want to use the lowest versioning threshold of all epilogues and main
2210 loop. This will enable us to enter a vectorized epilogue even when
2211 versioning the loop. We can't simply check whether the epilogue requires
2212 versioning though since we may have skipped some versioning checks when
2213 analyzing the epilogue. For instance, checks for alias versioning will be
2214 skipped when dealing with epilogues as we assume we already checked them
2215 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2216 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2218 poly_uint64 niters_th = 0;
2219 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2221 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2223 /* Niters for peeled prolog loop. */
2224 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2226 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2227 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2228 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2230 else
2231 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2234 /* Niters for at least one iteration of vectorized loop. */
2235 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2236 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2237 /* One additional iteration because of peeling for gap. */
2238 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2239 niters_th += 1;
2241 /* Use the same condition as vect_transform_loop to decide when to use
2242 the cost to determine a versioning threshold. */
2243 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2244 && ordered_p (th, niters_th))
2245 niters_th = ordered_max (poly_uint64 (th), niters_th);
2247 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2250 gcc_assert (known_eq (vectorization_factor,
2251 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2253 /* Ok to vectorize! */
2254 return opt_result::success ();
2256 again:
2257 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2258 gcc_assert (!ok);
2260 /* Try again with SLP forced off but if we didn't do any SLP there is
2261 no point in re-trying. */
2262 if (!slp)
2263 return ok;
2265 /* If there are reduction chains re-trying will fail anyway. */
2266 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2267 return ok;
2269 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2270 via interleaving or lane instructions. */
2271 slp_instance instance;
2272 slp_tree node;
2273 unsigned i, j;
2274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2276 stmt_vec_info vinfo;
2277 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2278 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2279 continue;
2280 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2281 unsigned int size = DR_GROUP_SIZE (vinfo);
2282 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2283 if (! vect_store_lanes_supported (vectype, size, false)
2284 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2285 && ! vect_grouped_store_supported (vectype, size))
2286 return opt_result::failure_at (vinfo->stmt,
2287 "unsupported grouped store\n");
2288 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2290 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2291 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2292 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2293 size = DR_GROUP_SIZE (vinfo);
2294 vectype = STMT_VINFO_VECTYPE (vinfo);
2295 if (! vect_load_lanes_supported (vectype, size, false)
2296 && ! vect_grouped_load_supported (vectype, single_element_p,
2297 size))
2298 return opt_result::failure_at (vinfo->stmt,
2299 "unsupported grouped load\n");
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "re-trying with SLP disabled\n");
2307 /* Roll back state appropriately. No SLP this time. */
2308 slp = false;
2309 /* Restore vectorization factor as it were without SLP. */
2310 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2311 /* Free the SLP instances. */
2312 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2313 vect_free_slp_instance (instance, false);
2314 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2315 /* Reset SLP type to loop_vect on all stmts. */
2316 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2318 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2319 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2320 !gsi_end_p (si); gsi_next (&si))
2322 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2323 STMT_SLP_TYPE (stmt_info) = loop_vect;
2324 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2325 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2327 /* vectorizable_reduction adjusts reduction stmt def-types,
2328 restore them to that of the PHI. */
2329 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2330 = STMT_VINFO_DEF_TYPE (stmt_info);
2331 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2332 (STMT_VINFO_REDUC_DEF (stmt_info)))
2333 = STMT_VINFO_DEF_TYPE (stmt_info);
2336 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2337 !gsi_end_p (si); gsi_next (&si))
2339 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2340 STMT_SLP_TYPE (stmt_info) = loop_vect;
2341 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2343 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2344 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2345 STMT_SLP_TYPE (stmt_info) = loop_vect;
2346 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2347 !gsi_end_p (pi); gsi_next (&pi))
2348 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2349 = loop_vect;
2353 /* Free optimized alias test DDRS. */
2354 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2355 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2356 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2357 /* Reset target cost data. */
2358 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2359 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2360 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2361 /* Reset accumulated rgroup information. */
2362 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2363 /* Reset assorted flags. */
2364 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2365 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2366 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2367 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2368 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2370 goto start_over;
2373 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2374 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2375 OLD_LOOP_VINFO is better unless something specifically indicates
2376 otherwise.
2378 Note that this deliberately isn't a partial order. */
2380 static bool
2381 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2382 loop_vec_info old_loop_vinfo)
2384 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2385 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2387 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2388 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2390 /* Always prefer a VF of loop->simdlen over any other VF. */
2391 if (loop->simdlen)
2393 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2394 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2395 if (new_simdlen_p != old_simdlen_p)
2396 return new_simdlen_p;
2399 /* Limit the VFs to what is likely to be the maximum number of iterations,
2400 to handle cases in which at least one loop_vinfo is fully-masked. */
2401 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2402 if (estimated_max_niter != -1)
2404 if (known_le (estimated_max_niter, new_vf))
2405 new_vf = estimated_max_niter;
2406 if (known_le (estimated_max_niter, old_vf))
2407 old_vf = estimated_max_niter;
2410 /* Check whether the (fractional) cost per scalar iteration is lower
2411 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2412 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2413 * poly_widest_int (old_vf));
2414 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2415 * poly_widest_int (new_vf));
2416 if (maybe_lt (rel_old, rel_new))
2417 return false;
2418 if (known_lt (rel_new, rel_old))
2419 return true;
2421 /* If there's nothing to choose between the loop bodies, see whether
2422 there's a difference in the prologue and epilogue costs. */
2423 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2424 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2426 return false;
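/* A worked example of the cross-multiplied comparison above, with
   made-up costs: for new_inside_cost == 10 at new_vf == 8 versus
   old_inside_cost == 6 at old_vf == 4, rel_new == 10 * 4 == 40 and
   rel_old == 6 * 8 == 48.  Since 40 < 48 the new loop body is cheaper
   per scalar iteration (10/8 < 6/4), so the new loop_vinfo is
   preferred.  */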
2429 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2430 true if we should. */
2432 static bool
2433 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2434 loop_vec_info old_loop_vinfo)
2436 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2437 return false;
2439 if (dump_enabled_p ())
2440 dump_printf_loc (MSG_NOTE, vect_location,
2441 "***** Preferring vector mode %s to vector mode %s\n",
2442 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2443 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2444 return true;
2447 /* Function vect_analyze_loop.
2449 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2450 for it. The different analyses will record information in the
2451 loop_vec_info struct. */
2452 opt_loop_vec_info
2453 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2455 auto_vector_modes vector_modes;
2457 /* Autodetect first vector size we try. */
2458 unsigned int autovec_flags
2459 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2460 loop->simdlen != 0);
2461 unsigned int mode_i = 0;
2463 DUMP_VECT_SCOPE ("analyze_loop_nest");
2465 if (loop_outer (loop)
2466 && loop_vec_info_for_loop (loop_outer (loop))
2467 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2468 return opt_loop_vec_info::failure_at (vect_location,
2469 "outer-loop already vectorized.\n");
2471 if (!find_loop_nest (loop, &shared->loop_nest))
2472 return opt_loop_vec_info::failure_at
2473 (vect_location,
2474 "not vectorized: loop nest containing two or more consecutive inner"
2475 " loops cannot be vectorized\n");
2477 unsigned n_stmts = 0;
2478 machine_mode autodetected_vector_mode = VOIDmode;
2479 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2480 machine_mode next_vector_mode = VOIDmode;
2481 poly_uint64 lowest_th = 0;
2482 unsigned vectorized_loops = 0;
2483 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2484 && !unlimited_cost_model (loop));
2486 bool vect_epilogues = false;
2487 opt_result res = opt_result::success ();
2488 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2489 while (1)
2491 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2492 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2493 if (!loop_vinfo)
2495 if (dump_enabled_p ())
2496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2497 "bad loop form.\n");
2498 gcc_checking_assert (first_loop_vinfo == NULL);
2499 return loop_vinfo;
2501 loop_vinfo->vector_mode = next_vector_mode;
2503 bool fatal = false;
2505 /* When pick_lowest_cost_p is true, we should in principle iterate
2506 over all the loop_vec_infos that LOOP_VINFO could replace and
2507 try to vectorize LOOP_VINFO under the same conditions.
2508 E.g. when trying to replace an epilogue loop, we should vectorize
2509 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2510 to replace the main loop, we should vectorize LOOP_VINFO as a main
2511 loop too.
2513 However, autovectorize_vector_modes is usually sorted as follows:
2515 - Modes that naturally produce lower VFs usually follow modes that
2516 naturally produce higher VFs.
2518 - When modes naturally produce the same VF, maskable modes
2519 usually follow unmaskable ones, so that the maskable mode
2520 can be used to vectorize the epilogue of the unmaskable mode.
2522 This order is preferred because it leads to the maximum
2523 epilogue vectorization opportunities. Targets should only use
2524 a different order if they want to make wide modes available while
2525 disparaging them relative to earlier, smaller modes. The assumption
2526 in that case is that the wider modes are more expensive in some
2527 way that isn't reflected directly in the costs.
2529 There should therefore be few interesting cases in which
2530 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2531 treated as a standalone loop, and ends up being genuinely cheaper
2532 than FIRST_LOOP_VINFO. */
2533 if (vect_epilogues)
2534 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2536 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2537 if (mode_i == 0)
2538 autodetected_vector_mode = loop_vinfo->vector_mode;
2539 if (dump_enabled_p ())
2541 if (res)
2542 dump_printf_loc (MSG_NOTE, vect_location,
2543 "***** Analysis succeeded with vector mode %s\n",
2544 GET_MODE_NAME (loop_vinfo->vector_mode));
2545 else
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "***** Analysis failed with vector mode %s\n",
2548 GET_MODE_NAME (loop_vinfo->vector_mode));
2551 loop->aux = NULL;
2553 if (!fatal)
2554 while (mode_i < vector_modes.length ()
2555 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2557 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE, vect_location,
2559 "***** The result for vector mode %s would"
2560 " be the same\n",
2561 GET_MODE_NAME (vector_modes[mode_i]));
2562 mode_i += 1;
2565 if (res)
2567 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2568 vectorized_loops++;
2570 /* Once we hit the desired simdlen for the first time,
2571 discard any previous attempts. */
2572 if (simdlen
2573 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2575 delete first_loop_vinfo;
2576 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2577 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2578 simdlen = 0;
2580 else if (pick_lowest_cost_p && first_loop_vinfo)
2582 /* Keep trying to roll back vectorization attempts while the
2583 loop_vec_infos they produced were worse than this one. */
2584 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2585 while (!vinfos.is_empty ()
2586 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2588 gcc_assert (vect_epilogues);
2589 delete vinfos.pop ();
2591 if (vinfos.is_empty ()
2592 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2594 delete first_loop_vinfo;
2595 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2596 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2600 if (first_loop_vinfo == NULL)
2602 first_loop_vinfo = loop_vinfo;
2603 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2605 else if (vect_epilogues
2606 /* For now only allow one epilogue loop. */
2607 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2609 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2610 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2611 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2612 || maybe_ne (lowest_th, 0U));
2613 /* Keep track of the known smallest versioning
2614 threshold. */
2615 if (ordered_p (lowest_th, th))
2616 lowest_th = ordered_min (lowest_th, th);
2618 else
2619 delete loop_vinfo;
2621 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2622 enabled, SIMDUID is not set, it is the innermost loop and we have
2623 either already found the loop's SIMDLEN or there was no SIMDLEN to
2624 begin with.
2625 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2626 vect_epilogues = (!simdlen
2627 && loop->inner == NULL
2628 && param_vect_epilogues_nomask
2629 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2630 && !loop->simduid
2631 /* For now only allow one epilogue loop, but allow
2632 pick_lowest_cost_p to replace it. */
2633 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2634 || pick_lowest_cost_p));
2636 /* Commit to first_loop_vinfo if we have no reason to try
2637 alternatives. */
2638 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2639 break;
2641 else
2643 delete loop_vinfo;
2644 if (fatal)
2646 gcc_checking_assert (first_loop_vinfo == NULL);
2647 break;
2651 if (mode_i < vector_modes.length ()
2652 && VECTOR_MODE_P (autodetected_vector_mode)
2653 && (related_vector_mode (vector_modes[mode_i],
2654 GET_MODE_INNER (autodetected_vector_mode))
2655 == autodetected_vector_mode)
2656 && (related_vector_mode (autodetected_vector_mode,
2657 GET_MODE_INNER (vector_modes[mode_i]))
2658 == vector_modes[mode_i]))
2660 if (dump_enabled_p ())
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "***** Skipping vector mode %s, which would"
2663 " repeat the analysis for %s\n",
2664 GET_MODE_NAME (vector_modes[mode_i]),
2665 GET_MODE_NAME (autodetected_vector_mode));
2666 mode_i += 1;
2669 if (mode_i == vector_modes.length ()
2670 || autodetected_vector_mode == VOIDmode)
2671 break;
2673 /* Try the next biggest vector size. */
2674 next_vector_mode = vector_modes[mode_i++];
2675 if (dump_enabled_p ())
2676 dump_printf_loc (MSG_NOTE, vect_location,
2677 "***** Re-trying analysis with vector mode %s\n",
2678 GET_MODE_NAME (next_vector_mode));
2681 if (first_loop_vinfo)
2683 loop->aux = (loop_vec_info) first_loop_vinfo;
2684 if (dump_enabled_p ())
2685 dump_printf_loc (MSG_NOTE, vect_location,
2686 "***** Choosing vector mode %s\n",
2687 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2688 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2689 return first_loop_vinfo;
2692 return opt_loop_vec_info::propagate_failure (res);
2695 /* Return true if there is an in-order reduction function for CODE, storing
2696 it in *REDUC_FN if so. */
2698 static bool
2699 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2701 switch (code)
2703 case PLUS_EXPR:
2704 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2705 return true;
2707 default:
2708 return false;
2712 /* Function reduction_fn_for_scalar_code
2714 Input:
2715 CODE - tree_code of a reduction operation.
2717 Output:
2718 REDUC_FN - the corresponding internal function to be used to reduce the
2719 vector of partial results into a single scalar result, or IFN_LAST
2720 if the operation is a supported reduction operation, but does not have
2721 such an internal function.
2723 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2725 static bool
2726 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2728 switch (code)
2730 case MAX_EXPR:
2731 *reduc_fn = IFN_REDUC_MAX;
2732 return true;
2734 case MIN_EXPR:
2735 *reduc_fn = IFN_REDUC_MIN;
2736 return true;
2738 case PLUS_EXPR:
2739 *reduc_fn = IFN_REDUC_PLUS;
2740 return true;
2742 case BIT_AND_EXPR:
2743 *reduc_fn = IFN_REDUC_AND;
2744 return true;
2746 case BIT_IOR_EXPR:
2747 *reduc_fn = IFN_REDUC_IOR;
2748 return true;
2750 case BIT_XOR_EXPR:
2751 *reduc_fn = IFN_REDUC_XOR;
2752 return true;
2754 case MULT_EXPR:
2755 case MINUS_EXPR:
2756 *reduc_fn = IFN_LAST;
2757 return true;
2759 default:
2760 return false;
2764 /* If there is a neutral value X such that SLP reduction NODE would not
2765 be affected by the introduction of additional X elements, return that X,
2766 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2767 is the vector type that would hold element X. REDUC_CHAIN is true if
2768 the SLP statements perform a single reduction, false if each statement
2769 performs an independent reduction. */
2771 static tree
2772 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2773 tree_code code, bool reduc_chain)
2775 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2776 stmt_vec_info stmt_vinfo = stmts[0];
2777 tree scalar_type = TREE_TYPE (vector_type);
2778 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2779 gcc_assert (loop);
2781 switch (code)
2783 case WIDEN_SUM_EXPR:
2784 case DOT_PROD_EXPR:
2785 case SAD_EXPR:
2786 case PLUS_EXPR:
2787 case MINUS_EXPR:
2788 case BIT_IOR_EXPR:
2789 case BIT_XOR_EXPR:
2790 return build_zero_cst (scalar_type);
2792 case MULT_EXPR:
2793 return build_one_cst (scalar_type);
2795 case BIT_AND_EXPR:
2796 return build_all_ones_cst (scalar_type);
2798 case MAX_EXPR:
2799 case MIN_EXPR:
2800 /* For MIN/MAX the initial values are neutral. A reduction chain
2801 has only a single initial value, so that value is neutral for
2802 all statements. */
2803 if (reduc_chain)
2804 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2805 loop_preheader_edge (loop));
2806 return NULL_TREE;
2808 default:
2809 return NULL_TREE;
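/* For instance: for a sum reduction the neutral value is 0, so padding
   a partially filled vector as { a, b, 0, 0 } leaves the reduction
   result a + b unchanged; for MULT_EXPR it is 1 and for BIT_AND_EXPR
   an all-ones value, since x & ~0 == x.  MIN and MAX have no such
   universal value, which is why the initial value itself is reused
   for reduction chains above.  */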
2813 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2814 STMT is printed with a message MSG. */
2816 static void
2817 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2819 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2822 /* Return true if we need an in-order reduction for operation CODE
2823 on type TYPE. */
2826 bool
2827 needs_fold_left_reduction_p (tree type, tree_code code)
2829 /* CHECKME: check for !flag_finite_math_only too? */
2830 if (SCALAR_FLOAT_TYPE_P (type))
2831 switch (code)
2833 case MIN_EXPR:
2834 case MAX_EXPR:
2835 return false;
2837 default:
2838 return !flag_associative_math;
2841 if (INTEGRAL_TYPE_P (type))
2843 if (!operation_no_trapping_overflow (type, code))
2844 return true;
2845 return false;
2848 if (SAT_FIXED_POINT_TYPE_P (type))
2849 return true;
2851 return false;
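/* As a concrete example of why the floating-point case above requires
   an in-order reduction by default: with doubles,
   (0.5 + 1e18) - 1e18 evaluates to 0.0 whereas 0.5 + (1e18 - 1e18)
   evaluates to 0.5, so reassociating an FP sum can change the result.
   Only under -fassociative-math (implied by -ffast-math) may an
   out-of-order reduction be used for such operations.  */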
2854 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2855 has a handled computation expression. Store the main reduction
2856 operation in *CODE. */
2858 static bool
2859 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2860 tree loop_arg, enum tree_code *code,
2861 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2863 auto_bitmap visited;
2864 tree lookfor = PHI_RESULT (phi);
2865 ssa_op_iter curri;
2866 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2867 while (USE_FROM_PTR (curr) != loop_arg)
2868 curr = op_iter_next_use (&curri);
2869 curri.i = curri.numops;
2872 path.safe_push (std::make_pair (curri, curr));
2873 tree use = USE_FROM_PTR (curr);
2874 if (use == lookfor)
2875 break;
2876 gimple *def = SSA_NAME_DEF_STMT (use);
2877 if (gimple_nop_p (def)
2878 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2880 pop:
2883 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2884 curri = x.first;
2885 curr = x.second;
2887 curr = op_iter_next_use (&curri);
2888 /* Skip already visited or non-SSA operands (from iterating
2889 over PHI args). */
2890 while (curr != NULL_USE_OPERAND_P
2891 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2892 || ! bitmap_set_bit (visited,
2893 SSA_NAME_VERSION
2894 (USE_FROM_PTR (curr)))));
2896 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2897 if (curr == NULL_USE_OPERAND_P)
2898 break;
2900 else
2902 if (gimple_code (def) == GIMPLE_PHI)
2903 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2904 else
2905 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2906 while (curr != NULL_USE_OPERAND_P
2907 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2908 || ! bitmap_set_bit (visited,
2909 SSA_NAME_VERSION
2910 (USE_FROM_PTR (curr)))))
2911 curr = op_iter_next_use (&curri);
2912 if (curr == NULL_USE_OPERAND_P)
2913 goto pop;
2916 while (1);
2917 if (dump_file && (dump_flags & TDF_DETAILS))
2919 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2920 unsigned i;
2921 std::pair<ssa_op_iter, use_operand_p> *x;
2922 FOR_EACH_VEC_ELT (path, i, x)
2923 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2924 dump_printf (MSG_NOTE, "\n");
2927 /* Check whether the reduction path detected is valid. */
2928 bool fail = path.length () == 0;
2929 bool neg = false;
2930 int sign = -1;
2931 *code = ERROR_MARK;
2932 for (unsigned i = 1; i < path.length (); ++i)
2934 gimple *use_stmt = USE_STMT (path[i].second);
2935 tree op = USE_FROM_PTR (path[i].second);
2936 if (! is_gimple_assign (use_stmt)
2937 /* The following makes sure we can compute the operand index
2938 easily, plus it mostly disallows chaining via COND_EXPR condition
2939 operands. */
2940 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2941 && (gimple_num_ops (use_stmt) <= 2
2942 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2943 && (gimple_num_ops (use_stmt) <= 3
2944 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2946 fail = true;
2947 break;
2949 /* Check that the op is used in only a single stmt inside
2950 the loop. */
2951 imm_use_iterator imm_iter;
2952 gimple *op_use_stmt;
2953 unsigned cnt = 0;
2954 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2955 if (!is_gimple_debug (op_use_stmt)
2956 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2958 /* We want to allow x + x but not x < 1 ? x : 2. */
2959 if (is_gimple_assign (op_use_stmt)
2960 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2962 use_operand_p use_p;
2963 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2964 cnt++;
2966 else
2967 cnt++;
2969 if (cnt != 1)
2971 fail = true;
2972 break;
2974 tree_code use_code = gimple_assign_rhs_code (use_stmt);
2975 if (use_code == MINUS_EXPR)
2977 use_code = PLUS_EXPR;
2978 /* Track whether we negate the reduction value each iteration. */
2979 if (gimple_assign_rhs2 (use_stmt) == op)
2980 neg = ! neg;
2982 if (CONVERT_EXPR_CODE_P (use_code)
2983 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
2984 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
2986 else if (*code == ERROR_MARK)
2988 *code = use_code;
2989 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
2991 else if (use_code != *code)
2993 fail = true;
2994 break;
2996 else if ((use_code == MIN_EXPR
2997 || use_code == MAX_EXPR)
2998 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3000 fail = true;
3001 break;
3004 return ! fail && ! neg && *code != ERROR_MARK;
3007 bool
3008 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3009 tree loop_arg, enum tree_code code)
3011 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3012 enum tree_code code_;
3013 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3014 && code_ == code);
3019 /* Function vect_is_simple_reduction
3021 (1) Detect a cross-iteration def-use cycle that represents a simple
3022 reduction computation. We look for the following pattern:
3024 loop_header:
3025 a1 = phi < a0, a2 >
3026 a3 = ...
3027 a2 = operation (a3, a1)
3031 a3 = ...
3032 loop_header:
3033 a1 = phi < a0, a2 >
3034 a2 = operation (a3, a1)
3036 such that:
3037 1. operation is commutative and associative and it is safe to
3038 change the order of the computation
3039 2. no uses for a2 in the loop (a2 is used out of the loop)
3040 3. no uses of a1 in the loop besides the reduction operation
3041 4. no uses of a1 outside the loop.
3043 Conditions 1,4 are tested here.
3044 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3046 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3047 nested cycles.
3049 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3050 reductions:
3052 a1 = phi < a0, a2 >
3053 inner loop (def of a3)
3054 a2 = phi < a3 >
3056 (4) Detect condition expressions, i.e.:
3057 for (int i = 0; i < N; i++)
3058 if (a[i] < val)
3059 ret_val = a[i];
3063 static stmt_vec_info
3064 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3065 bool *double_reduc, bool *reduc_chain_p)
3067 gphi *phi = as_a <gphi *> (phi_info->stmt);
3068 gimple *phi_use_stmt = NULL;
3069 imm_use_iterator imm_iter;
3070 use_operand_p use_p;
3072 *double_reduc = false;
3073 *reduc_chain_p = false;
3074 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3076 tree phi_name = PHI_RESULT (phi);
3077 /* ??? If there are no uses of the PHI result the inner loop reduction
3078 won't be detected as possibly double-reduction by vectorizable_reduction
3079 because that tries to walk the PHI arg from the preheader edge which
3080 can be constant. See PR60382. */
3081 if (has_zero_uses (phi_name))
3082 return NULL;
3083 class loop *loop = (gimple_bb (phi))->loop_father;
3084 unsigned nphi_def_loop_uses = 0;
3085 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3087 gimple *use_stmt = USE_STMT (use_p);
3088 if (is_gimple_debug (use_stmt))
3089 continue;
3091 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3093 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3095 "intermediate value used outside loop.\n");
3097 return NULL;
3100 nphi_def_loop_uses++;
3101 phi_use_stmt = use_stmt;
3104 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3105 if (TREE_CODE (latch_def) != SSA_NAME)
3107 if (dump_enabled_p ())
3108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3109 "reduction: not ssa_name: %T\n", latch_def);
3110 return NULL;
3113 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3114 if (!def_stmt_info
3115 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3116 return NULL;
3118 bool nested_in_vect_loop
3119 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3120 unsigned nlatch_def_loop_uses = 0;
3121 auto_vec<gphi *, 3> lcphis;
3122 bool inner_loop_of_double_reduc = false;
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3125 gimple *use_stmt = USE_STMT (use_p);
3126 if (is_gimple_debug (use_stmt))
3127 continue;
3128 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3129 nlatch_def_loop_uses++;
3130 else
3132 /* We can have more than one loop-closed PHI. */
3133 lcphis.safe_push (as_a <gphi *> (use_stmt));
3134 if (nested_in_vect_loop
3135 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3136 == vect_double_reduction_def))
3137 inner_loop_of_double_reduc = true;
3141 /* If we are vectorizing an inner reduction we are executing that
3142 in the original order only if we are not dealing with a
3143 double reduction. */
3144 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3146 if (dump_enabled_p ())
3147 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3148 "detected nested cycle: ");
3149 return def_stmt_info;
3152 /* If this isn't a nested cycle or if the nested cycle reduction value
3153 is used outside of the inner loop we cannot handle uses of the reduction
3154 value. */
3155 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3159 "reduction used in loop.\n");
3160 return NULL;
3163 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3164 defined in the inner loop. */
3165 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3167 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3168 if (gimple_phi_num_args (def_stmt) != 1
3169 || TREE_CODE (op1) != SSA_NAME)
3171 if (dump_enabled_p ())
3172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3173 "unsupported phi node definition.\n");
3175 return NULL;
3178 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3179 if (gimple_bb (def1)
3180 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3181 && loop->inner
3182 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3183 && is_gimple_assign (def1)
3184 && is_a <gphi *> (phi_use_stmt)
3185 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3187 if (dump_enabled_p ())
3188 report_vect_op (MSG_NOTE, def_stmt,
3189 "detected double reduction: ");
3191 *double_reduc = true;
3192 return def_stmt_info;
3195 return NULL;
3198 /* Look for the expression computing latch_def from the loop PHI result. */
3199 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3200 enum tree_code code;
3201 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3202 path))
3204 STMT_VINFO_REDUC_CODE (phi_info) = code;
3205 if (code == COND_EXPR && !nested_in_vect_loop)
3206 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3208 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3209 reduction chain for which the additional restriction is that
3210 all operations in the chain are the same. */
3211 auto_vec<stmt_vec_info, 8> reduc_chain;
3212 unsigned i;
3213 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3214 for (i = path.length () - 1; i >= 1; --i)
3216 gimple *stmt = USE_STMT (path[i].second);
3217 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3218 STMT_VINFO_REDUC_IDX (stmt_info)
3219 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3220 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3221 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3222 && (i == 1 || i == path.length () - 1));
3223 if ((stmt_code != code && !leading_conversion)
3224 /* We can only handle the final value in epilogue
3225 generation for reduction chains. */
3226 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3227 is_slp_reduc = false;
3228 /* For reduction chains we support trailing/leading
3229 conversions. We do not store those in the actual chain. */
3230 if (leading_conversion)
3231 continue;
3232 reduc_chain.safe_push (stmt_info);
3234 if (is_slp_reduc && reduc_chain.length () > 1)
3236 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3238 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3239 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3241 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3242 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3244 /* Save the chain for further analysis in SLP detection. */
3245 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3246 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3248 *reduc_chain_p = true;
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "reduction: detected reduction chain\n");
3253 else if (dump_enabled_p ())
3254 dump_printf_loc (MSG_NOTE, vect_location,
3255 "reduction: detected reduction\n");
3257 return def_stmt_info;
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "reduction: unknown pattern\n");
3264 return NULL;
3267 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3268 int
3269 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3270 int *peel_iters_epilogue,
3271 stmt_vector_for_cost *scalar_cost_vec,
3272 stmt_vector_for_cost *prologue_cost_vec,
3273 stmt_vector_for_cost *epilogue_cost_vec)
3275 int retval = 0;
3276 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3278 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3280 *peel_iters_epilogue = assumed_vf / 2;
3281 if (dump_enabled_p ())
3282 dump_printf_loc (MSG_NOTE, vect_location,
3283 "cost model: epilogue peel iters set to vf/2 "
3284 "because loop iterations are unknown .\n");
3286 /* If peeled iterations are known but the number of scalar loop
3287 iterations is unknown, count a taken branch per peeled loop. */
3288 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3289 NULL, 0, vect_prologue);
3290 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3291 NULL, 0, vect_epilogue);
3293 else
3295 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3296 peel_iters_prologue = niters < peel_iters_prologue ?
3297 niters : peel_iters_prologue;
3298 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3299 /* If we need to peel for gaps but the epilogue otherwise requires no
3300 peeling, we have to peel VF iterations. */
3301 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3302 *peel_iters_epilogue = assumed_vf;
3305 stmt_info_for_cost *si;
3306 int j;
3307 if (peel_iters_prologue)
3308 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 retval += record_stmt_cost (prologue_cost_vec,
3310 si->count * peel_iters_prologue,
3311 si->kind, si->stmt_info, si->misalign,
3312 vect_prologue);
3313 if (*peel_iters_epilogue)
3314 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3315 retval += record_stmt_cost (epilogue_cost_vec,
3316 si->count * *peel_iters_epilogue,
3317 si->kind, si->stmt_info, si->misalign,
3318 vect_epilogue);
3320 return retval;
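/* A worked example of the known-niters branch above, with illustrative
   numbers: for LOOP_VINFO_INT_NITERS == 100, an assumed VF of 4 and
   peel_iters_prologue == 3, the epilogue is costed at
   (100 - 3) % 4 == 1 iteration; if peeling for gaps were required and
   that remainder were 0, the epilogue would instead be costed at a
   full VF of 4 iterations.  The scalar single-iteration costs are then
   accumulated that many times into the two cost vectors.  */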
3323 /* Function vect_estimate_min_profitable_iters
3325 Return the number of iterations required for the vector version of the
3326 loop to be profitable relative to the cost of the scalar version of the
3327 loop.
3329 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3330 of iterations for vectorization. -1 value means loop vectorization
3331 is not profitable. This returned value may be used for dynamic
3332 profitability check.
3334 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3335 for static check against estimated number of iterations. */
3337 static void
3338 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3339 int *ret_min_profitable_niters,
3340 int *ret_min_profitable_estimate)
3342 int min_profitable_iters;
3343 int min_profitable_estimate;
3344 int peel_iters_prologue;
3345 int peel_iters_epilogue;
3346 unsigned vec_inside_cost = 0;
3347 int vec_outside_cost = 0;
3348 unsigned vec_prologue_cost = 0;
3349 unsigned vec_epilogue_cost = 0;
3350 int scalar_single_iter_cost = 0;
3351 int scalar_outside_cost = 0;
3352 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3353 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3354 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3356 /* Cost model disabled. */
3357 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3359 if (dump_enabled_p ())
3360 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3361 *ret_min_profitable_niters = 0;
3362 *ret_min_profitable_estimate = 0;
3363 return;
3366 /* Requires loop versioning tests to handle misalignment. */
3367 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3369 /* FIXME: Make cost depend on complexity of individual check. */
3370 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3371 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3372 vect_prologue);
3373 if (dump_enabled_p ())
3374 dump_printf (MSG_NOTE,
3375 "cost model: Adding cost of checks for loop "
3376 "versioning to treat misalignment.\n");
3379 /* Requires loop versioning with alias checks. */
3380 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3382 /* FIXME: Make cost depend on complexity of individual check. */
3383 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3384 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3385 vect_prologue);
3386 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3387 if (len)
3388 /* Count LEN - 1 ANDs and LEN comparisons. */
3389 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3390 NULL, 0, vect_prologue);
3391 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3392 if (len)
3394 /* Count LEN - 1 ANDs and LEN comparisons. */
3395 unsigned int nstmts = len * 2 - 1;
3396 /* +1 for each bias that needs adding. */
3397 for (unsigned int i = 0; i < len; ++i)
3398 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3399 nstmts += 1;
3400 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3401 NULL, 0, vect_prologue);
3403 if (dump_enabled_p ())
3404 dump_printf (MSG_NOTE,
3405 "cost model: Adding cost of checks for loop "
3406 "versioning aliasing.\n");
3409 /* Requires loop versioning with niter checks. */
3410 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3412 /* FIXME: Make cost depend on complexity of individual check. */
3413 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3414 vect_prologue);
3415 if (dump_enabled_p ())
3416 dump_printf (MSG_NOTE,
3417 "cost model: Adding cost of checks for loop "
3418 "versioning niters.\n");
3421 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3422 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3423 vect_prologue);
3425 /* Count statements in scalar loop. Using this as scalar cost for a single
3426 iteration for now.
3428 TODO: Add outer loop support.
3430 TODO: Consider assigning different costs to different scalar
3431 statements. */
3433 scalar_single_iter_cost
3434 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3436 /* Add additional cost for the peeled instructions in prologue and epilogue
3437 loop. (For fully-masked loops there will be no peeling.)
3439 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3440 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3442 TODO: Build an expression that represents peel_iters for prologue and
3443 epilogue to be used in a run-time test. */
3445 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3447 peel_iters_prologue = 0;
3448 peel_iters_epilogue = 0;
3450 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3452 /* We need to peel exactly one iteration. */
3453 peel_iters_epilogue += 1;
3454 stmt_info_for_cost *si;
3455 int j;
3456 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3457 j, si)
3458 (void) add_stmt_cost (target_cost_data, si->count,
3459 si->kind, si->stmt_info, si->misalign,
3460 vect_epilogue);
3463 /* Calculate how many masks we need to generate. */
3464 unsigned int num_masks = 0;
3465 rgroup_masks *rgm;
3466 unsigned int num_vectors_m1;
3467 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3468 if (rgm->mask_type)
3469 num_masks += num_vectors_m1 + 1;
3470 gcc_assert (num_masks > 0);
3472 /* In the worst case, we need to generate each mask in the prologue
3473 and in the loop body. One of the loop body mask instructions
3474 replaces the comparison in the scalar loop, and since we don't
3475 count the scalar comparison against the scalar body, we shouldn't
3476 count that vector instruction against the vector body either.
3478 Sometimes we can use unpacks instead of generating prologue
3479 masks and sometimes the prologue mask will fold to a constant,
3480 so the actual prologue cost might be smaller. However, it's
3481 simpler and safer to use the worst-case cost; if this ends up
3482 being the tie-breaker between vectorizing or not, then it's
3483 probably better not to vectorize. */
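      /* Illustrative example (editorial note, hypothetical numbers): with
	 two rgroups that have a mask type, needing 1 and 2 mask vectors
	 respectively, NUM_MASKS = 1 + 2 = 3.  The statements below then
	 charge 3 vector stmts to the prologue and NUM_MASKS - 1 = 2 to the
	 body, reflecting the worst case described above.  */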
3484 (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3485 NULL, 0, vect_prologue);
3486 (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3487 NULL, 0, vect_body);
3489 else if (npeel < 0)
3491 peel_iters_prologue = assumed_vf / 2;
3492 if (dump_enabled_p ())
3493 dump_printf (MSG_NOTE, "cost model: "
3494 "prologue peel iters set to vf/2.\n");
 3496       /* If peeling for alignment is unknown, the loop bound of the main loop
 3497          becomes unknown.  */
3498 peel_iters_epilogue = assumed_vf / 2;
3499 if (dump_enabled_p ())
3500 dump_printf (MSG_NOTE, "cost model: "
3501 "epilogue peel iters set to vf/2 because "
3502 "peeling for alignment is unknown.\n");
3504 /* If peeled iterations are unknown, count a taken branch and a not taken
3505 branch per peeled loop. Even if scalar loop iterations are known,
3506 vector iterations are not known since peeled prologue iterations are
3507 not known. Hence guards remain the same. */
3508 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3509 NULL, 0, vect_prologue);
3510 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3511 NULL, 0, vect_prologue);
3512 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3513 NULL, 0, vect_epilogue);
3514 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3515 NULL, 0, vect_epilogue);
3516 stmt_info_for_cost *si;
3517 int j;
3518 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3520 (void) add_stmt_cost (target_cost_data,
3521 si->count * peel_iters_prologue,
3522 si->kind, si->stmt_info, si->misalign,
3523 vect_prologue);
3524 (void) add_stmt_cost (target_cost_data,
3525 si->count * peel_iters_epilogue,
3526 si->kind, si->stmt_info, si->misalign,
3527 vect_epilogue);
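      /* Illustrative example (editorial note, hypothetical numbers): for
	 this NPEEL < 0 case with ASSUMED_VF = 8, both peel_iters_prologue
	 and peel_iters_epilogue are set to 8 / 2 = 4, so every scalar
	 statement is charged 4 times to the prologue and 4 times to the
	 epilogue, on top of one taken and one not-taken branch for each of
	 the two peel loops.  */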
3530 else
3532 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3533 stmt_info_for_cost *si;
3534 int j;
3535 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3537 prologue_cost_vec.create (2);
3538 epilogue_cost_vec.create (2);
3539 peel_iters_prologue = npeel;
3541 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3542 &peel_iters_epilogue,
3543 &LOOP_VINFO_SCALAR_ITERATION_COST
3544 (loop_vinfo),
3545 &prologue_cost_vec,
3546 &epilogue_cost_vec);
3548 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3549 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3550 si->misalign, vect_prologue);
3552 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3553 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3554 si->misalign, vect_epilogue);
3556 prologue_cost_vec.release ();
 3557       epilogue_cost_vec.release ();
3560 /* FORNOW: The scalar outside cost is incremented in one of the
3561 following ways:
3563 1. The vectorizer checks for alignment and aliasing and generates
3564 a condition that allows dynamic vectorization. A cost model
3565 check is ANDED with the versioning condition. Hence scalar code
3566 path now has the added cost of the versioning check.
3568 if (cost > th & versioning_check)
3569 jmp to vector code
3571 Hence run-time scalar is incremented by not-taken branch cost.
3573 2. The vectorizer then checks if a prologue is required. If the
3574 cost model check was not done before during versioning, it has to
3575 be done before the prologue check.
3577 if (cost <= th)
3578 prologue = scalar_iters
3579 if (prologue == 0)
3580 jmp to vector code
3581 else
3582 execute prologue
3583 if (prologue == num_iters)
3584 go to exit
3586 Hence the run-time scalar cost is incremented by a taken branch,
3587 plus a not-taken branch, plus a taken branch cost.
3589 3. The vectorizer then checks if an epilogue is required. If the
3590 cost model check was not done before during prologue check, it
3591 has to be done with the epilogue check.
3593 if (prologue == 0)
3594 jmp to vector code
3595 else
3596 execute prologue
3597 if (prologue == num_iters)
3598 go to exit
3599 vector code:
3600 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3601 jmp to epilogue
3603 Hence the run-time scalar cost should be incremented by 2 taken
3604 branches.
3606 TODO: The back end may reorder the BBS's differently and reverse
3607 conditions/branch directions. Change the estimates below to
3608 something more reasonable. */
3610 /* If the number of iterations is known and we do not do versioning, we can
3611 decide whether to vectorize at compile time. Hence the scalar version
 3612      does not carry cost model guard costs.  */
3613 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3614 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3616 /* Cost model check occurs at versioning. */
3617 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3618 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3619 else
3621 /* Cost model check occurs at prologue generation. */
3622 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3623 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3624 + vect_get_stmt_cost (cond_branch_not_taken);
3625 /* Cost model check occurs at epilogue generation. */
3626 else
3627 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
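      /* Illustrative example (editorial note, hypothetical target costs):
	 with cond_branch_taken = 3 and cond_branch_not_taken = 1, the three
	 cases above add to SCALAR_OUTSIDE_COST:

	   versioning check          -> 1
	   unknown prologue peeling  -> 2 * 3 + 1 = 7
	   epilogue-only check       -> 2 * 3 = 6  */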
3631 /* Complete the target-specific cost calculations. */
3632 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3633 &vec_inside_cost, &vec_epilogue_cost);
3635 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3637 /* Stash the costs so that we can compare two loop_vec_infos. */
3638 loop_vinfo->vec_inside_cost = vec_inside_cost;
3639 loop_vinfo->vec_outside_cost = vec_outside_cost;
3641 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3644 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3645 vec_inside_cost);
3646 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3647 vec_prologue_cost);
3648 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3649 vec_epilogue_cost);
3650 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3651 scalar_single_iter_cost);
3652 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3653 scalar_outside_cost);
3654 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3655 vec_outside_cost);
3656 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3657 peel_iters_prologue);
3658 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3659 peel_iters_epilogue);
3662 /* Calculate number of iterations required to make the vector version
3663 profitable, relative to the loop bodies only. The following condition
3664 must hold true:
3665 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3666 where
3667 SIC = scalar iteration cost, VIC = vector iteration cost,
3668 VOC = vector outside cost, VF = vectorization factor,
3669 NPEEL = prologue iterations + epilogue iterations,
3670 SOC = scalar outside cost for run time cost model check. */
3672 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3673 - vec_inside_cost);
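  /* Illustrative example (editorial note, hypothetical numbers): with
     SIC = 4, ASSUMED_VF = 4 and VIC = 10, saving_per_viter
     = 4 * 4 - 10 = 6, i.e. each vector iteration is 6 units cheaper than
     the 4 scalar iterations it replaces.  */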
3674 if (saving_per_viter <= 0)
3676 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3677 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3678 "vectorization did not happen for a simd loop");
3680 if (dump_enabled_p ())
3681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3682 "cost model: the vector iteration cost = %d "
3683 "divided by the scalar iteration cost = %d "
3684 "is greater or equal to the vectorization factor = %d"
3685 ".\n",
3686 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3687 *ret_min_profitable_niters = -1;
3688 *ret_min_profitable_estimate = -1;
3689 return;
3692 /* ??? The "if" arm is written to handle all cases; see below for what
3693 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3694 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3696 /* Rewriting the condition above in terms of the number of
3697 vector iterations (vniters) rather than the number of
3698 scalar iterations (niters) gives:
3700 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3702 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3704 For integer N, X and Y when X > 0:
3706 N * X > Y <==> N >= (Y /[floor] X) + 1. */
3707 int outside_overhead = (vec_outside_cost
3708 - scalar_single_iter_cost * peel_iters_prologue
3709 - scalar_single_iter_cost * peel_iters_epilogue
3710 - scalar_outside_cost);
3711 /* We're only interested in cases that require at least one
3712 vector iteration. */
3713 int min_vec_niters = 1;
3714 if (outside_overhead > 0)
3715 min_vec_niters = outside_overhead / saving_per_viter + 1;
3717 if (dump_enabled_p ())
3718 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3719 min_vec_niters);
3721 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3723 /* Now that we know the minimum number of vector iterations,
3724 find the minimum niters for which the scalar cost is larger:
3726 SIC * niters > VIC * vniters + VOC - SOC
3728 We know that the minimum niters is no more than
3729 vniters * VF + NPEEL, but it might be (and often is) less
3730 than that if a partial vector iteration is cheaper than the
3731 equivalent scalar code. */
3732 int threshold = (vec_inside_cost * min_vec_niters
3733 + vec_outside_cost
3734 - scalar_outside_cost);
3735 if (threshold <= 0)
3736 min_profitable_iters = 1;
3737 else
3738 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3740 else
3741 /* Convert the number of vector iterations into a number of
3742 scalar iterations. */
3743 min_profitable_iters = (min_vec_niters * assumed_vf
3744 + peel_iters_prologue
3745 + peel_iters_epilogue);
3747 else
3749 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3750 * assumed_vf
3751 - vec_inside_cost * peel_iters_prologue
3752 - vec_inside_cost * peel_iters_epilogue);
3753 if (min_profitable_iters <= 0)
3754 min_profitable_iters = 0;
3755 else
3757 min_profitable_iters /= saving_per_viter;
3759 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3760 <= (((int) vec_inside_cost * min_profitable_iters)
3761 + (((int) vec_outside_cost - scalar_outside_cost)
3762 * assumed_vf)))
3763 min_profitable_iters++;
3767 if (dump_enabled_p ())
3768 dump_printf (MSG_NOTE,
3769 " Calculated minimum iters for profitability: %d\n",
3770 min_profitable_iters);
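  /* Illustrative example (editorial note, hypothetical numbers for the
     fully-masked case with NPEEL = 0): with SIC = 4, VF = 4, VIC = 10,
     VOC = 30, SOC = 6 and saving_per_viter = 6:

       outside_overhead = 30 - 6 = 24, so min_vec_niters = 24/6 + 1 = 5;
       threshold = 10 * 5 + 30 - 6 = 74, so
       min_profitable_iters = 74/4 + 1 = 19.

     Indeed 4 * 19 = 76 > 74, while 4 * 18 = 72 is not.  */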
3772 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3773 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3774 /* We want the vectorized loop to execute at least once. */
3775 min_profitable_iters = assumed_vf + peel_iters_prologue;
3777 if (dump_enabled_p ())
3778 dump_printf_loc (MSG_NOTE, vect_location,
3779 " Runtime profitability threshold = %d\n",
3780 min_profitable_iters);
3782 *ret_min_profitable_niters = min_profitable_iters;
3784 /* Calculate number of iterations required to make the vector version
3785 profitable, relative to the loop bodies only.
3787 Non-vectorized variant is SIC * niters and it must win over vector
3788 variant on the expected loop trip count. The following condition must hold true:
3789 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3791 if (vec_outside_cost <= 0)
3792 min_profitable_estimate = 0;
3793 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3795 /* This is a repeat of the code above, but with + SOC rather
3796 than - SOC. */
3797 int outside_overhead = (vec_outside_cost
3798 - scalar_single_iter_cost * peel_iters_prologue
3799 - scalar_single_iter_cost * peel_iters_epilogue
3800 + scalar_outside_cost);
3801 int min_vec_niters = 1;
3802 if (outside_overhead > 0)
3803 min_vec_niters = outside_overhead / saving_per_viter + 1;
3805 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3807 int threshold = (vec_inside_cost * min_vec_niters
3808 + vec_outside_cost
3809 + scalar_outside_cost);
3810 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3812 else
3813 min_profitable_estimate = (min_vec_niters * assumed_vf
3814 + peel_iters_prologue
3815 + peel_iters_epilogue);
3817 else
3819 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3820 * assumed_vf
3821 - vec_inside_cost * peel_iters_prologue
3822 - vec_inside_cost * peel_iters_epilogue)
3823 / ((scalar_single_iter_cost * assumed_vf)
3824 - vec_inside_cost);
3826 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3827 if (dump_enabled_p ())
3828 dump_printf_loc (MSG_NOTE, vect_location,
3829 " Static estimate profitability threshold = %d\n",
3830 min_profitable_estimate);
3832 *ret_min_profitable_estimate = min_profitable_estimate;
3835 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3836 vector elements (not bits) for a vector with NELT elements. */
3837 static void
3838 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3839 vec_perm_builder *sel)
3841 /* The encoding is a single stepped pattern. Any wrap-around is handled
3842 by vec_perm_indices. */
3843 sel->new_vector (nelt, 1, 3);
3844 for (unsigned int i = 0; i < 3; i++)
3845 sel->quick_push (i + offset);
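/* Illustrative usage sketch (editorial note; the values are hypothetical):

     vec_perm_builder sel;
     calc_vec_perm_mask_for_shift (2, 8, &sel);   // encodes {2, 3, 4}

   vec_perm_indices then extends the stepped pattern to
   {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the second input
   vector, so the permute acts as a whole-vector shift down by two
   elements.  */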
3848 /* Checks whether the target supports whole-vector shifts for vectors of mode
3849 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3850 it supports vec_perm_const with masks for all necessary shift amounts. */
3851 static bool
3852 have_whole_vector_shift (machine_mode mode)
3854 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3855 return true;
3857 /* Variable-length vectors should be handled via the optab. */
3858 unsigned int nelt;
3859 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3860 return false;
3862 vec_perm_builder sel;
3863 vec_perm_indices indices;
3864 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3866 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3867 indices.new_vector (sel, 2, nelt);
3868 if (!can_vec_perm_const_p (mode, indices, false))
3869 return false;
3871 return true;
3874 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3875 functions. Design better to avoid maintenance issues. */
3877 /* Function vect_model_reduction_cost.
3879 Models cost for a reduction operation, including the vector ops
3880 generated within the strip-mine loop, the initial definition before
3881 the loop, and the epilogue code that must be generated. */
3883 static void
3884 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3885 vect_reduction_type reduction_type,
3886 int ncopies, stmt_vector_for_cost *cost_vec)
3888 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3889 enum tree_code code;
3890 optab optab;
3891 tree vectype;
3892 machine_mode mode;
3893 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3894 class loop *loop = NULL;
3896 if (loop_vinfo)
3897 loop = LOOP_VINFO_LOOP (loop_vinfo);
3899 /* Condition reductions generate two reductions in the loop. */
3900 if (reduction_type == COND_REDUCTION)
3901 ncopies *= 2;
3903 vectype = STMT_VINFO_VECTYPE (stmt_info);
3904 mode = TYPE_MODE (vectype);
3905 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3907 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3909 if (reduction_type == EXTRACT_LAST_REDUCTION)
3910 /* No extra instructions are needed in the prologue. The loop body
3911 operations are costed in vectorizable_condition. */
3912 inside_cost = 0;
3913 else if (reduction_type == FOLD_LEFT_REDUCTION)
3915 /* No extra instructions needed in the prologue. */
3916 prologue_cost = 0;
3918 if (reduc_fn != IFN_LAST)
3919 /* Count one reduction-like operation per vector. */
3920 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3921 stmt_info, 0, vect_body);
3922 else
3924 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3925 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3926 inside_cost = record_stmt_cost (cost_vec, nelements,
3927 vec_to_scalar, stmt_info, 0,
3928 vect_body);
3929 inside_cost += record_stmt_cost (cost_vec, nelements,
3930 scalar_stmt, stmt_info, 0,
3931 vect_body);
3934 else
3936 /* Add in cost for initial definition.
3937 For cond reduction we have four vectors: initial index, step,
3938 initial result of the data reduction, initial value of the index
3939 reduction. */
3940 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3941 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3942 scalar_to_vec, stmt_info, 0,
3943 vect_prologue);
3945 /* Cost of reduction op inside loop. */
3946 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3947 stmt_info, 0, vect_body);
3950 /* Determine cost of epilogue code.
3952 We have a reduction operator that will reduce the vector in one statement.
3953 Also requires scalar extract. */
3955 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3957 if (reduc_fn != IFN_LAST)
3959 if (reduction_type == COND_REDUCTION)
 3961           /* An EQ stmt and a COND_EXPR stmt.  */
3962 epilogue_cost += record_stmt_cost (cost_vec, 2,
3963 vector_stmt, stmt_info, 0,
3964 vect_epilogue);
3965 /* Reduction of the max index and a reduction of the found
3966 values. */
3967 epilogue_cost += record_stmt_cost (cost_vec, 2,
3968 vec_to_scalar, stmt_info, 0,
3969 vect_epilogue);
3970 /* A broadcast of the max value. */
3971 epilogue_cost += record_stmt_cost (cost_vec, 1,
3972 scalar_to_vec, stmt_info, 0,
3973 vect_epilogue);
3975 else
3977 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3978 stmt_info, 0, vect_epilogue);
3979 epilogue_cost += record_stmt_cost (cost_vec, 1,
3980 vec_to_scalar, stmt_info, 0,
3981 vect_epilogue);
3984 else if (reduction_type == COND_REDUCTION)
3986 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3987 /* Extraction of scalar elements. */
3988 epilogue_cost += record_stmt_cost (cost_vec,
3989 2 * estimated_nunits,
3990 vec_to_scalar, stmt_info, 0,
3991 vect_epilogue);
3992 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3993 epilogue_cost += record_stmt_cost (cost_vec,
3994 2 * estimated_nunits - 3,
3995 scalar_stmt, stmt_info, 0,
3996 vect_epilogue);
3998 else if (reduction_type == EXTRACT_LAST_REDUCTION
3999 || reduction_type == FOLD_LEFT_REDUCTION)
 4000     /* No extra instructions are needed in the epilogue.  */
4002 else
4004 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4005 tree bitsize =
4006 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4007 int element_bitsize = tree_to_uhwi (bitsize);
4008 int nelements = vec_size_in_bits / element_bitsize;
4010 if (code == COND_EXPR)
4011 code = MAX_EXPR;
4013 optab = optab_for_tree_code (code, vectype, optab_default);
4015 /* We have a whole vector shift available. */
4016 if (optab != unknown_optab
4017 && VECTOR_MODE_P (mode)
4018 && optab_handler (optab, mode) != CODE_FOR_nothing
4019 && have_whole_vector_shift (mode))
4021 /* Final reduction via vector shifts and the reduction operator.
4022 Also requires scalar extract. */
4023 epilogue_cost += record_stmt_cost (cost_vec,
4024 exact_log2 (nelements) * 2,
4025 vector_stmt, stmt_info, 0,
4026 vect_epilogue);
4027 epilogue_cost += record_stmt_cost (cost_vec, 1,
4028 vec_to_scalar, stmt_info, 0,
4029 vect_epilogue);
4031 else
4032 /* Use extracts and reduction op for final reduction. For N
4033 elements, we have N extracts and N-1 reduction ops. */
4034 epilogue_cost += record_stmt_cost (cost_vec,
4035 nelements + nelements - 1,
4036 vector_stmt, stmt_info, 0,
4037 vect_epilogue);
4041 if (dump_enabled_p ())
4042 dump_printf (MSG_NOTE,
4043 "vect_model_reduction_cost: inside_cost = %d, "
4044 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4045 prologue_cost, epilogue_cost);
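/* Illustrative example (editorial note, hypothetical counts): for a plain
   add reduction over 8 elements (e.g. V8HI with a short result):

     reduc_fn available       -> epilogue: 1 vector_stmt + 1 vec_to_scalar
     whole-vector shifts only -> epilogue: log2(8) * 2 = 6 vector stmts
                                 + 1 vec_to_scalar
     neither                  -> epilogue: 8 + 7 = 15 stmts
                                 (8 extracts and 7 reduction ops)  */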
4049 /* Function vect_model_induction_cost.
4051 Models cost for induction operations. */
4053 static void
4054 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4055 stmt_vector_for_cost *cost_vec)
4057 unsigned inside_cost, prologue_cost;
4059 if (PURE_SLP_STMT (stmt_info))
4060 return;
4062 /* loop cost for vec_loop. */
4063 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4064 stmt_info, 0, vect_body);
4066 /* prologue cost for vec_init and vec_step. */
4067 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4068 stmt_info, 0, vect_prologue);
4070 if (dump_enabled_p ())
4071 dump_printf_loc (MSG_NOTE, vect_location,
4072 "vect_model_induction_cost: inside_cost = %d, "
4073 "prologue_cost = %d .\n", inside_cost, prologue_cost);
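/* Illustrative example (editorial note): with NCOPIES = 2 the loop body is
   charged 2 vector stmts, while the prologue is always charged
   2 scalar_to_vec stmts (one for vec_init, one for vec_step).  */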
4078 /* Function get_initial_def_for_reduction
4080 Input:
4081 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4082 INIT_VAL - the initial value of the reduction variable
4084 Output:
4085 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4086 of the reduction (used for adjusting the epilog - see below).
4087 Return a vector variable, initialized according to the operation that
4088 STMT_VINFO performs. This vector will be used as the initial value
4089 of the vector of partial results.
4091 Option1 (adjust in epilog): Initialize the vector as follows:
4092 add/bit or/xor: [0,0,...,0,0]
4093 mult/bit and: [1,1,...,1,1]
4094 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4095 and when necessary (e.g. add/mult case) let the caller know
4096 that it needs to adjust the result by init_val.
4098 Option2: Initialize the vector as follows:
4099 add/bit or/xor: [init_val,0,0,...,0]
4100 mult/bit and: [init_val,1,1,...,1]
4101 min/max/cond_expr: [init_val,init_val,...,init_val]
4102 and no adjustments are needed.
4104 For example, for the following code:
4106 s = init_val;
4107 for (i=0;i<n;i++)
4108 s = s + a[i];
4110 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4111 For a vector of 4 units, we want to return either [0,0,0,init_val],
4112 or [0,0,0,0] and let the caller know that it needs to adjust
4113 the result at the end by 'init_val'.
 4115    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
 4116    is not NULL, because its initialization vector is simpler (the same element
 4117    in all entries), and Option2 otherwise.
4119 A cost model should help decide between these two schemes. */
4121 static tree
4122 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4123 enum tree_code code, tree init_val,
4124 tree *adjustment_def)
4126 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4127 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4128 tree scalar_type = TREE_TYPE (init_val);
4129 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4130 tree def_for_init;
4131 tree init_def;
4132 REAL_VALUE_TYPE real_init_val = dconst0;
4133 int int_init_val = 0;
4134 gimple_seq stmts = NULL;
4136 gcc_assert (vectype);
4138 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4139 || SCALAR_FLOAT_TYPE_P (scalar_type));
4141 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4142 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4144 /* ADJUSTMENT_DEF is NULL when called from
4145 vect_create_epilog_for_reduction to vectorize double reduction. */
4146 if (adjustment_def)
4147 *adjustment_def = NULL;
4149 switch (code)
4151 case WIDEN_SUM_EXPR:
4152 case DOT_PROD_EXPR:
4153 case SAD_EXPR:
4154 case PLUS_EXPR:
4155 case MINUS_EXPR:
4156 case BIT_IOR_EXPR:
4157 case BIT_XOR_EXPR:
4158 case MULT_EXPR:
4159 case BIT_AND_EXPR:
4161 if (code == MULT_EXPR)
4163 real_init_val = dconst1;
4164 int_init_val = 1;
4167 if (code == BIT_AND_EXPR)
4168 int_init_val = -1;
4170 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4171 def_for_init = build_real (scalar_type, real_init_val);
4172 else
4173 def_for_init = build_int_cst (scalar_type, int_init_val);
4175 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4177 /* Option1: the first element is '0' or '1' as well. */
4178 if (!operand_equal_p (def_for_init, init_val, 0))
4179 *adjustment_def = init_val;
4180 init_def = gimple_build_vector_from_val (&stmts, vectype,
4181 def_for_init);
4183 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4185 /* Option2 (variable length): the first element is INIT_VAL. */
4186 init_def = gimple_build_vector_from_val (&stmts, vectype,
4187 def_for_init);
4188 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4189 vectype, init_def, init_val);
4191 else
4193 /* Option2: the first element is INIT_VAL. */
4194 tree_vector_builder elts (vectype, 1, 2);
4195 elts.quick_push (init_val);
4196 elts.quick_push (def_for_init);
4197 init_def = gimple_build_vector (&stmts, &elts);
4200 break;
4202 case MIN_EXPR:
4203 case MAX_EXPR:
4204 case COND_EXPR:
4206 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4207 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4209 break;
4211 default:
4212 gcc_unreachable ();
4215 if (stmts)
4216 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4217 return init_def;
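/* Illustrative example (editorial note): for s += a[i] with init_val = 5
   and a 4-element vector:

     Option1 (ADJUSTMENT_DEF requested): return {0, 0, 0, 0} and set
       *adjustment_def = 5, so the caller adds 5 back in the epilogue;
     Option2 (fixed length): return {5, 0, 0, 0};
     Option2 (variable length): splat the neutral 0 and insert 5 via
       CFN_VEC_SHL_INSERT.

   For a MIN/MAX/COND_EXPR reduction the vector is simply a splat of
   init_val in either scheme.  */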
4220 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4221 NUMBER_OF_VECTORS is the number of vector defs to create.
4222 If NEUTRAL_OP is nonnull, introducing extra elements of that
4223 value will not change the result. */
4225 static void
4226 get_initial_defs_for_reduction (slp_tree slp_node,
4227 vec<tree> *vec_oprnds,
4228 unsigned int number_of_vectors,
4229 bool reduc_chain, tree neutral_op)
4231 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4232 stmt_vec_info stmt_vinfo = stmts[0];
4233 vec_info *vinfo = stmt_vinfo->vinfo;
4234 unsigned HOST_WIDE_INT nunits;
4235 unsigned j, number_of_places_left_in_vector;
4236 tree vector_type;
4237 unsigned int group_size = stmts.length ();
4238 unsigned int i;
4239 class loop *loop;
4241 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4243 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4245 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4246 gcc_assert (loop);
4247 edge pe = loop_preheader_edge (loop);
4249 gcc_assert (!reduc_chain || neutral_op);
4251 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4252 created vectors. It is greater than 1 if unrolling is performed.
4254 For example, we have two scalar operands, s1 and s2 (e.g., group of
4255 strided accesses of size two), while NUNITS is four (i.e., four scalars
4256 of this type can be packed in a vector). The output vector will contain
4257 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4258 will be 2).
4260 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4261 vectors containing the operands.
4263 For example, NUNITS is four as before, and the group size is 8
4264 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4265 {s5, s6, s7, s8}. */
4267 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4268 nunits = group_size;
4270 number_of_places_left_in_vector = nunits;
4271 bool constant_p = true;
4272 tree_vector_builder elts (vector_type, nunits, 1);
4273 elts.quick_grow (nunits);
4274 gimple_seq ctor_seq = NULL;
4275 for (j = 0; j < nunits * number_of_vectors; ++j)
4277 tree op;
4278 i = j % group_size;
4279 stmt_vinfo = stmts[i];
4281 /* Get the def before the loop. In reduction chain we have only
4282 one initial value. Else we have as many as PHIs in the group. */
4283 if (reduc_chain)
4284 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4285 else if (((vec_oprnds->length () + 1) * nunits
4286 - number_of_places_left_in_vector >= group_size)
4287 && neutral_op)
4288 op = neutral_op;
4289 else
4290 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4292 /* Create 'vect_ = {op0,op1,...,opn}'. */
4293 number_of_places_left_in_vector--;
4294 elts[nunits - number_of_places_left_in_vector - 1] = op;
4295 if (!CONSTANT_CLASS_P (op))
4296 constant_p = false;
4298 if (number_of_places_left_in_vector == 0)
4300 tree init;
4301 if (constant_p && !neutral_op
4302 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4303 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4304 /* Build the vector directly from ELTS. */
4305 init = gimple_build_vector (&ctor_seq, &elts);
4306 else if (neutral_op)
4308 /* Build a vector of the neutral value and shift the
4309 other elements into place. */
4310 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4311 neutral_op);
4312 int k = nunits;
4313 while (k > 0 && elts[k - 1] == neutral_op)
4314 k -= 1;
4315 while (k > 0)
4317 k -= 1;
4318 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4319 vector_type, init, elts[k]);
4322 else
4324 /* First time round, duplicate ELTS to fill the
4325 required number of vectors. */
4326 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4327 number_of_vectors, *vec_oprnds);
4328 break;
4330 vec_oprnds->quick_push (init);
4332 number_of_places_left_in_vector = nunits;
4333 elts.new_vector (vector_type, nunits, 1);
4334 elts.quick_grow (nunits);
4335 constant_p = true;
4338 if (ctor_seq != NULL)
4339 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4342 /* For a statement STMT_INFO taking part in a reduction operation return
4343 the stmt_vec_info the meta information is stored on. */
4345 stmt_vec_info
4346 info_for_reduction (stmt_vec_info stmt_info)
4348 stmt_info = vect_orig_stmt (stmt_info);
4349 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4350 if (!is_a <gphi *> (stmt_info->stmt))
4351 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4352 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4353 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4355 if (gimple_phi_num_args (phi) == 1)
4356 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4358 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4360 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4361 stmt_vec_info info
4362 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4363 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4364 stmt_info = info;
4366 return stmt_info;
4369 /* Function vect_create_epilog_for_reduction
4371 Create code at the loop-epilog to finalize the result of a reduction
4372 computation.
4374 STMT_INFO is the scalar reduction stmt that is being vectorized.
4375 SLP_NODE is an SLP node containing a group of reduction statements. The
4376 first one in this group is STMT_INFO.
4377 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4378 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4379 (counting from 0)
4381 This function:
4382 1. Completes the reduction def-use cycles.
4383 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4384 by calling the function specified by REDUC_FN if available, or by
4385 other means (whole-vector shifts or a scalar loop).
4386 The function also creates a new phi node at the loop exit to preserve
4387 loop-closed form, as illustrated below.
4389 The flow at the entry to this function:
4391 loop:
4392 vec_def = phi <vec_init, null> # REDUCTION_PHI
4393 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4394 s_loop = scalar_stmt # (scalar) STMT_INFO
4395 loop_exit:
4396 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4397 use <s_out0>
4398 use <s_out0>
4400 The above is transformed by this function into:
4402 loop:
4403 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4404 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4405 s_loop = scalar_stmt # (scalar) STMT_INFO
4406 loop_exit:
4407 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4408 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4409 v_out2 = reduce <v_out1>
4410 s_out3 = extract_field <v_out2, 0>
4411 s_out4 = adjust_result <s_out3>
4412 use <s_out4>
4413 use <s_out4>
4416 static void
4417 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4418 slp_tree slp_node,
4419 slp_instance slp_node_instance)
4421 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4422 gcc_assert (reduc_info->is_reduc_info);
4423 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4424 /* For double reductions we need to get at the inner loop reduction
4425 stmt which has the meta info attached. Our stmt_info is that of the
4426 loop-closed PHI of the inner loop which we remember as
4427 def for the reduction PHI generation. */
4428 bool double_reduc = false;
4429 stmt_vec_info rdef_info = stmt_info;
4430 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4432 gcc_assert (!slp_node);
4433 double_reduc = true;
4434 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4435 (stmt_info->stmt, 0));
4436 stmt_info = vect_stmt_to_vectorize (stmt_info);
4438 gphi *reduc_def_stmt
4439 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4440 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4441 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4442 stmt_vec_info prev_phi_info;
4443 tree vectype;
4444 machine_mode mode;
4445 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4446 basic_block exit_bb;
4447 tree scalar_dest;
4448 tree scalar_type;
4449 gimple *new_phi = NULL, *phi;
4450 stmt_vec_info phi_info;
4451 gimple_stmt_iterator exit_gsi;
4452 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4453 gimple *epilog_stmt = NULL;
4454 gimple *exit_phi;
4455 tree bitsize;
4456 tree def;
4457 tree orig_name, scalar_result;
4458 imm_use_iterator imm_iter, phi_imm_iter;
4459 use_operand_p use_p, phi_use_p;
4460 gimple *use_stmt;
4461 bool nested_in_vect_loop = false;
4462 auto_vec<gimple *> new_phis;
4463 int j, i;
4464 auto_vec<tree> scalar_results;
4465 unsigned int group_size = 1, k;
4466 auto_vec<gimple *> phis;
4467 bool slp_reduc = false;
4468 bool direct_slp_reduc;
4469 tree new_phi_result;
4470 tree induction_index = NULL_TREE;
4472 if (slp_node)
4473 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4475 if (nested_in_vect_loop_p (loop, stmt_info))
4477 outer_loop = loop;
4478 loop = loop->inner;
4479 nested_in_vect_loop = true;
4480 gcc_assert (!slp_node);
4482 gcc_assert (!nested_in_vect_loop || double_reduc);
4484 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4485 gcc_assert (vectype);
4486 mode = TYPE_MODE (vectype);
4488 tree initial_def = NULL;
4489 tree induc_val = NULL_TREE;
4490 tree adjustment_def = NULL;
4491 if (slp_node)
4493 else
4495 /* Get at the scalar def before the loop, that defines the initial value
4496 of the reduction variable. */
4497 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4498 loop_preheader_edge (loop));
4499 /* Optimize: for induction condition reduction, if we can't use zero
4500 for induc_val, use initial_def. */
4501 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4502 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4503 else if (double_reduc)
4505 else if (nested_in_vect_loop)
4507 else
4508 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4511 unsigned vec_num;
4512 int ncopies;
4513 if (slp_node)
4515 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4516 ncopies = 1;
4518 else
4520 vec_num = 1;
4521 ncopies = 0;
4522 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4525 ncopies++;
4526 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4528 while (phi_info);
4531 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4532 which is updated with the current index of the loop for every match of
4533 the original loop's cond_expr (VEC_STMT). This results in a vector
4534 containing the last time the condition passed for that vector lane.
4535 The first match will be a 1 to allow 0 to be used for non-matching
4536 indexes. If there are no matches at all then the vector will be all
4537 zeroes.
4539 PR92772: This algorithm is broken for architectures that support
4540 masked vectors, but do not provide fold_extract_last. */
4541 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4543 auto_vec<std::pair<tree, bool>, 2> ccompares;
4544 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4545 cond_info = vect_stmt_to_vectorize (cond_info);
4546 while (cond_info != reduc_info)
4548 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4550 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4551 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4552 ccompares.safe_push
4553 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4554 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4556 cond_info
4557 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4558 1 + STMT_VINFO_REDUC_IDX
4559 (cond_info)));
4560 cond_info = vect_stmt_to_vectorize (cond_info);
4562 gcc_assert (ccompares.length () != 0);
4564 tree indx_before_incr, indx_after_incr;
4565 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4566 int scalar_precision
4567 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4568 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4569 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4570 (TYPE_MODE (vectype), cr_index_scalar_type,
4571 TYPE_VECTOR_SUBPARTS (vectype));
4573 /* First we create a simple vector induction variable which starts
4574 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4575 vector size (STEP). */
4577 /* Create a {1,2,3,...} vector. */
4578 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4580 /* Create a vector of the step value. */
4581 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4582 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4584 /* Create an induction variable. */
4585 gimple_stmt_iterator incr_gsi;
4586 bool insert_after;
4587 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4588 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4589 insert_after, &indx_before_incr, &indx_after_incr);
4591 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4592 filled with zeros (VEC_ZERO). */
4594 /* Create a vector of 0s. */
4595 tree zero = build_zero_cst (cr_index_scalar_type);
4596 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4598 /* Create a vector phi node. */
4599 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4600 new_phi = create_phi_node (new_phi_tree, loop->header);
4601 loop_vinfo->add_stmt (new_phi);
4602 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4603 loop_preheader_edge (loop), UNKNOWN_LOCATION);
 4605       /* Now take the condition from the loop's original cond_exprs
4606 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4607 every match uses values from the induction variable
4608 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4609 (NEW_PHI_TREE).
4610 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4611 the new cond_expr (INDEX_COND_EXPR). */
4612 gimple_seq stmts = NULL;
4613 for (int i = ccompares.length () - 1; i != -1; --i)
4615 tree ccompare = ccompares[i].first;
4616 if (ccompares[i].second)
4617 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4618 cr_index_vector_type,
4619 ccompare,
4620 indx_before_incr, new_phi_tree);
4621 else
4622 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4623 cr_index_vector_type,
4624 ccompare,
4625 new_phi_tree, indx_before_incr);
4627 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4628 stmt_vec_info index_vec_info
4629 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4630 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4632 /* Update the phi with the vec cond. */
4633 induction_index = new_phi_tree;
4634 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4635 loop_latch_edge (loop), UNKNOWN_LOCATION);
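      /* Illustrative example (editorial note, hypothetical lanes): with a
	 4-element index vector, SERIES_VECT = {1, 2, 3, 4} and STEP = 4.
	 Starting from {0, 0, 0, 0}, if the condition matches in lanes 1 and
	 3 of the first vector iteration the index vector becomes
	 {0, 2, 0, 4}; if only lane 0 matches in the second iteration
	 (IV = {5, 6, 7, 8}) it becomes {5, 2, 0, 4}.  The maximum element,
	 5, later identifies the lane that matched last.  */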
4638 /* 2. Create epilog code.
4639 The reduction epilog code operates across the elements of the vector
4640 of partial results computed by the vectorized loop.
4641 The reduction epilog code consists of:
4643 step 1: compute the scalar result in a vector (v_out2)
4644 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4645 step 3: adjust the scalar result (s_out3) if needed.
 4647         Step 1 can be accomplished using one of the following three schemes:
4648 (scheme 1) using reduc_fn, if available.
4649 (scheme 2) using whole-vector shifts, if available.
4650 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4651 combined.
4653 The overall epilog code looks like this:
4655 s_out0 = phi <s_loop> # original EXIT_PHI
4656 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4657 v_out2 = reduce <v_out1> # step 1
4658 s_out3 = extract_field <v_out2, 0> # step 2
4659 s_out4 = adjust_result <s_out3> # step 3
4661 (step 3 is optional, and steps 1 and 2 may be combined).
4662 Lastly, the uses of s_out0 are replaced by s_out4. */
4665 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4666 v_out1 = phi <VECT_DEF>
4667 Store them in NEW_PHIS. */
4668 if (double_reduc)
4669 loop = outer_loop;
4670 exit_bb = single_exit (loop)->dest;
4671 prev_phi_info = NULL;
4672 new_phis.create (slp_node ? vec_num : ncopies);
4673 for (unsigned i = 0; i < vec_num; i++)
4675 if (slp_node)
4676 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4677 else
4678 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4679 for (j = 0; j < ncopies; j++)
4681 tree new_def = copy_ssa_name (def);
4682 phi = create_phi_node (new_def, exit_bb);
4683 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4684 if (j == 0)
4685 new_phis.quick_push (phi);
4686 else
4688 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4689 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4692 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4693 prev_phi_info = phi_info;
4697 exit_gsi = gsi_after_labels (exit_bb);
4699 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4700 (i.e. when reduc_fn is not available) and in the final adjustment
4701 code (if needed). Also get the original scalar reduction variable as
4702 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4703 represents a reduction pattern), the tree-code and scalar-def are
4704 taken from the original stmt that the pattern-stmt (STMT) replaces.
4705 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4706 are taken from STMT. */
4708 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4709 if (orig_stmt_info != stmt_info)
4711 /* Reduction pattern */
4712 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4713 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4716 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4717 scalar_type = TREE_TYPE (scalar_dest);
4718 scalar_results.create (group_size);
4719 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4720 bitsize = TYPE_SIZE (scalar_type);
4722 /* SLP reduction without reduction chain, e.g.,
4723 # a1 = phi <a2, a0>
4724 # b1 = phi <b2, b0>
4725 a2 = operation (a1)
4726 b2 = operation (b1) */
4727 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4729 /* True if we should implement SLP_REDUC using native reduction operations
4730 instead of scalar operations. */
4731 direct_slp_reduc = (reduc_fn != IFN_LAST
4732 && slp_reduc
4733 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4735 /* In case of reduction chain, e.g.,
4736 # a1 = phi <a3, a0>
4737 a2 = operation (a1)
4738 a3 = operation (a2),
4740 we may end up with more than one vector result. Here we reduce them to
4741 one vector. */
4742 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4744 gimple_seq stmts = NULL;
4745 tree first_vect = PHI_RESULT (new_phis[0]);
4746 first_vect = gimple_convert (&stmts, vectype, first_vect);
4747 for (k = 1; k < new_phis.length (); k++)
4749 gimple *next_phi = new_phis[k];
4750 tree second_vect = PHI_RESULT (next_phi);
4751 second_vect = gimple_convert (&stmts, vectype, second_vect);
4752 first_vect = gimple_build (&stmts, code, vectype,
4753 first_vect, second_vect);
4755 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4757 new_phi_result = first_vect;
4758 new_phis.truncate (0);
4759 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
 4761   /* Likewise if we couldn't use a single def-use cycle.  */
4762 else if (ncopies > 1)
4764 gcc_assert (new_phis.length () == 1);
4765 gimple_seq stmts = NULL;
4766 tree first_vect = PHI_RESULT (new_phis[0]);
4767 first_vect = gimple_convert (&stmts, vectype, first_vect);
4768 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4769 for (int k = 1; k < ncopies; ++k)
4771 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4772 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4773 second_vect = gimple_convert (&stmts, vectype, second_vect);
4774 first_vect = gimple_build (&stmts, code, vectype,
4775 first_vect, second_vect);
4777 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4778 new_phi_result = first_vect;
4779 new_phis.truncate (0);
4780 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4782 else
4783 new_phi_result = PHI_RESULT (new_phis[0]);
4785 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4786 && reduc_fn != IFN_LAST)
4788 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4789 various data values where the condition matched and another vector
4790 (INDUCTION_INDEX) containing all the indexes of those matches. We
4791 need to extract the last matching index (which will be the index with
4792 highest value) and use this to index into the data vector.
4793 For the case where there were no matches, the data vector will contain
4794 all default values and the index vector will be all zeros. */
4796 /* Get various versions of the type of the vector of indexes. */
4797 tree index_vec_type = TREE_TYPE (induction_index);
4798 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4799 tree index_scalar_type = TREE_TYPE (index_vec_type);
4800 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4802 /* Get an unsigned integer version of the type of the data vector. */
4803 int scalar_precision
4804 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4805 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4806 tree vectype_unsigned = build_vector_type
4807 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4809 /* First we need to create a vector (ZERO_VEC) of zeros and another
4810 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4811 can create using a MAX reduction and then expanding.
4812 In the case where the loop never made any matches, the max index will
4813 be zero. */
4815 /* Vector of {0, 0, 0,...}. */
4816 tree zero_vec = build_zero_cst (vectype);
4818 gimple_seq stmts = NULL;
4819 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4820 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4822 /* Find maximum value from the vector of found indexes. */
4823 tree max_index = make_ssa_name (index_scalar_type);
4824 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4825 1, induction_index);
4826 gimple_call_set_lhs (max_index_stmt, max_index);
4827 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4829 /* Vector of {max_index, max_index, max_index,...}. */
4830 tree max_index_vec = make_ssa_name (index_vec_type);
4831 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4832 max_index);
4833 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4834 max_index_vec_rhs);
4835 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4837 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4838 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4839 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4840 otherwise. Only one value should match, resulting in a vector
4841 (VEC_COND) with one data value and the rest zeros.
4842 In the case where the loop never made any matches, every index will
4843 match, resulting in a vector with all data values (which will all be
4844 the default value). */
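      /* Illustrative example (editorial note, continuing hypothetical
	 values): with INDUCTION_INDEX = {5, 2, 0, 4} and data vector
	 {a, b, c, d}, IFN_REDUC_MAX gives max_index = 5, the comparison
	 below yields {1, 0, 0, 0}, the VEC_COND selects {a, 0, 0, 0}, and
	 the final unsigned MAX reduction extracts a, the value stored by
	 the last matching iteration.  */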
4846 /* Compare the max index vector to the vector of found indexes to find
4847 the position of the max value. */
4848 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4849 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4850 induction_index,
4851 max_index_vec);
4852 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4854 /* Use the compare to choose either values from the data vector or
4855 zero. */
4856 tree vec_cond = make_ssa_name (vectype);
4857 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4858 vec_compare, new_phi_result,
4859 zero_vec);
4860 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4862 /* Finally we need to extract the data value from the vector (VEC_COND)
 4863         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4864 reduction, but because this doesn't exist, we can use a MAX reduction
4865 instead. The data value might be signed or a float so we need to cast
4866 it first.
4867 In the case where the loop never made any matches, the data values are
4868 all identical, and so will reduce down correctly. */
4870 /* Make the matched data values unsigned. */
4871 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4872 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4873 vec_cond);
4874 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4875 VIEW_CONVERT_EXPR,
4876 vec_cond_cast_rhs);
4877 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4879 /* Reduce down to a scalar value. */
4880 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4881 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4882 1, vec_cond_cast);
4883 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4884 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4886 /* Convert the reduced value back to the result type and set as the
4887 result. */
4888 stmts = NULL;
4889 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4890 data_reduc);
4891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4892 scalar_results.safe_push (new_temp);
4894 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4895 && reduc_fn == IFN_LAST)
4897 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4898 idx = 0;
4899 idx_val = induction_index[0];
4900 val = data_reduc[0];
4901 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4902 if (induction_index[i] > idx_val)
4903 val = data_reduc[i], idx_val = induction_index[i];
4904 return val; */
4906 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4907 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4908 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4909 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4910 /* Enforced by vectorizable_reduction, which ensures we have target
4911 support before allowing a conditional reduction on variable-length
4912 vectors. */
4913 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4914 tree idx_val = NULL_TREE, val = NULL_TREE;
4915 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4917 tree old_idx_val = idx_val;
4918 tree old_val = val;
4919 idx_val = make_ssa_name (idx_eltype);
4920 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4921 build3 (BIT_FIELD_REF, idx_eltype,
4922 induction_index,
4923 bitsize_int (el_size),
4924 bitsize_int (off)));
4925 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4926 val = make_ssa_name (data_eltype);
4927 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4928 build3 (BIT_FIELD_REF,
4929 data_eltype,
4930 new_phi_result,
4931 bitsize_int (el_size),
4932 bitsize_int (off)));
4933 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4934 if (off != 0)
4936 tree new_idx_val = idx_val;
4937 if (off != v_size - el_size)
4939 new_idx_val = make_ssa_name (idx_eltype);
4940 epilog_stmt = gimple_build_assign (new_idx_val,
4941 MAX_EXPR, idx_val,
4942 old_idx_val);
4943 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4945 tree new_val = make_ssa_name (data_eltype);
4946 epilog_stmt = gimple_build_assign (new_val,
4947 COND_EXPR,
4948 build2 (GT_EXPR,
4949 boolean_type_node,
4950 idx_val,
4951 old_idx_val),
4952 val, old_val);
4953 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4954 idx_val = new_idx_val;
4955 val = new_val;
4958 /* Convert the reduced value back to the result type and set as the
4959 result. */
4960 gimple_seq stmts = NULL;
4961 val = gimple_convert (&stmts, scalar_type, val);
4962 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4963 scalar_results.safe_push (val);
4966 /* 2.3 Create the reduction code, using one of the three schemes described
4967 above. In SLP we simply need to extract all the elements from the
4968 vector (without reducing them), so we use scalar shifts. */
4969 else if (reduc_fn != IFN_LAST && !slp_reduc)
4971 tree tmp;
4972 tree vec_elem_type;
4974 /* Case 1: Create:
4975 v_out2 = reduc_expr <v_out1> */
4977 if (dump_enabled_p ())
4978 dump_printf_loc (MSG_NOTE, vect_location,
4979 "Reduce using direct vector reduction.\n");
4981 gimple_seq stmts = NULL;
4982 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4983 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4984 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
4985 vec_elem_type, new_phi_result);
4986 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
4987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4989 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4990 && induc_val)
 4992         /* Earlier we set the initial value to be a vector of induc_val
 4993            values.  Check the result and, if it is induc_val, replace it
 4994            with the original initial value, unless induc_val is
 4995            the same as initial_def already.  */
4996 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4997 induc_val);
4999 tmp = make_ssa_name (new_scalar_dest);
5000 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5001 initial_def, new_temp);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5003 new_temp = tmp;
5006 scalar_results.safe_push (new_temp);
5008 else if (direct_slp_reduc)
5010 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5011 with the elements for other SLP statements replaced with the
5012 neutral value. We can then do a normal reduction on each vector. */
5014 /* Enforced by vectorizable_reduction. */
5015 gcc_assert (new_phis.length () == 1);
5016 gcc_assert (pow2p_hwi (group_size));
5018 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5019 vec<stmt_vec_info> orig_phis
5020 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5021 gimple_seq seq = NULL;
5023 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5024 and the same element size as VECTYPE. */
5025 tree index = build_index_vector (vectype, 0, 1);
5026 tree index_type = TREE_TYPE (index);
5027 tree index_elt_type = TREE_TYPE (index_type);
5028 tree mask_type = truth_type_for (index_type);
5030 /* Create a vector that, for each element, identifies which of
5031 the REDUC_GROUP_SIZE results should use it. */
5032 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5033 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5034 build_vector_from_val (index_type, index_mask));
5036 /* Get a neutral vector value. This is simply a splat of the neutral
5037 scalar value if we have one, otherwise the initial scalar value
5038 is itself a neutral value. */
5039 tree vector_identity = NULL_TREE;
5040 tree neutral_op = NULL_TREE;
5041 if (slp_node)
5043 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5044 neutral_op
5045 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5046 vectype, code, first != NULL);
5048 if (neutral_op)
5049 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5050 neutral_op);
5051 for (unsigned int i = 0; i < group_size; ++i)
 5053         /* If there's no universal neutral value, we can use the
5054 initial scalar value from the original PHI. This is used
5055 for MIN and MAX reduction, for example. */
5056 if (!neutral_op)
5058 tree scalar_value
5059 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5060 loop_preheader_edge (loop));
5061 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5062 scalar_value);
5063 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5064 scalar_value);
5067 /* Calculate the equivalent of:
5069 sel[j] = (index[j] == i);
5071 which selects the elements of NEW_PHI_RESULT that should
5072 be included in the result. */
5073 tree compare_val = build_int_cst (index_elt_type, i);
5074 compare_val = build_vector_from_val (index_type, compare_val);
5075 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5076 index, compare_val);
5078 /* Calculate the equivalent of:
5080 vec = sel ? new_phi_result : vector_identity;
5082 VEC is now suitable for a full vector reduction. */
5083 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5084 sel, new_phi_result, vector_identity);
5086 /* Do the reduction and convert it to the appropriate type. */
5087 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5088 TREE_TYPE (vectype), vec);
5089 scalar = gimple_convert (&seq, scalar_type, scalar);
5090 scalar_results.safe_push (scalar);
5092 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5094 else
5096 bool reduce_with_shift;
5097 tree vec_temp;
5099 gcc_assert (slp_reduc || new_phis.length () == 1);
5101 /* See if the target wants to do the final (shift) reduction
5102 in a vector mode of smaller size and first reduce upper/lower
5103 halves against each other. */
5104 enum machine_mode mode1 = mode;
5105 tree stype = TREE_TYPE (vectype);
5106 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5107 unsigned nunits1 = nunits;
5108 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5109 && new_phis.length () == 1)
5111 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5112 /* For SLP reductions we have to make sure lanes match up, but
5113 since we're doing an individual-element final reduction, reducing
5114 the vector width here is even more important.
5115 ??? We can also separate lanes with permutes; for the common
5116 case of a power-of-two group size, odd/even extracts would work. */
5117 if (slp_reduc && nunits != nunits1)
5119 nunits1 = least_common_multiple (nunits1, group_size);
5120 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5123 if (!slp_reduc
5124 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5125 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5127 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5128 stype, nunits1);
5129 reduce_with_shift = have_whole_vector_shift (mode1);
5130 if (!VECTOR_MODE_P (mode1))
5131 reduce_with_shift = false;
5132 else
5134 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5135 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5136 reduce_with_shift = false;
5139 /* First reduce the vector to the desired vector size on which we
5140 should do the shift reduction, by combining upper and lower halves. */
5141 new_temp = new_phi_result;
5142 while (nunits > nunits1)
5144 nunits /= 2;
5145 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5146 stype, nunits);
5147 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5149 /* The target has to make sure we support lowpart/highpart
5150 extraction, either via direct vector extract or through
5151 integer mode punning. */
5152 tree dst1, dst2;
5153 if (convert_optab_handler (vec_extract_optab,
5154 TYPE_MODE (TREE_TYPE (new_temp)),
5155 TYPE_MODE (vectype1))
5156 != CODE_FOR_nothing)
5158 /* Extract sub-vectors directly once vec_extract becomes
5159 a conversion optab. */
5160 dst1 = make_ssa_name (vectype1);
5161 epilog_stmt
5162 = gimple_build_assign (dst1, BIT_FIELD_REF,
5163 build3 (BIT_FIELD_REF, vectype1,
5164 new_temp, TYPE_SIZE (vectype1),
5165 bitsize_int (0)));
5166 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5167 dst2 = make_ssa_name (vectype1);
5168 epilog_stmt
5169 = gimple_build_assign (dst2, BIT_FIELD_REF,
5170 build3 (BIT_FIELD_REF, vectype1,
5171 new_temp, TYPE_SIZE (vectype1),
5172 bitsize_int (bitsize)));
5173 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5175 else
5177 /* Extract via punning to appropriately sized integer mode
5178 vector. */
5179 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5180 tree etype = build_vector_type (eltype, 2);
5181 gcc_assert (convert_optab_handler (vec_extract_optab,
5182 TYPE_MODE (etype),
5183 TYPE_MODE (eltype))
5184 != CODE_FOR_nothing);
5185 tree tem = make_ssa_name (etype);
5186 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5187 build1 (VIEW_CONVERT_EXPR,
5188 etype, new_temp));
5189 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5190 new_temp = tem;
5191 tem = make_ssa_name (eltype);
5192 epilog_stmt
5193 = gimple_build_assign (tem, BIT_FIELD_REF,
5194 build3 (BIT_FIELD_REF, eltype,
5195 new_temp, TYPE_SIZE (eltype),
5196 bitsize_int (0)));
5197 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5198 dst1 = make_ssa_name (vectype1);
5199 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5200 build1 (VIEW_CONVERT_EXPR,
5201 vectype1, tem));
5202 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5203 tem = make_ssa_name (eltype);
5204 epilog_stmt
5205 = gimple_build_assign (tem, BIT_FIELD_REF,
5206 build3 (BIT_FIELD_REF, eltype,
5207 new_temp, TYPE_SIZE (eltype),
5208 bitsize_int (bitsize)));
5209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5210 dst2 = make_ssa_name (vectype1);
5211 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5212 build1 (VIEW_CONVERT_EXPR,
5213 vectype1, tem));
5214 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5217 new_temp = make_ssa_name (vectype1);
5218 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 new_phis[0] = epilog_stmt;
5223 if (reduce_with_shift && !slp_reduc)
5225 int element_bitsize = tree_to_uhwi (bitsize);
5226 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5227 for variable-length vectors and also requires direct target support
5228 for loop reductions. */
5229 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5230 int nelements = vec_size_in_bits / element_bitsize;
5231 vec_perm_builder sel;
5232 vec_perm_indices indices;
5234 int elt_offset;
5236 tree zero_vec = build_zero_cst (vectype1);
5237 /* Case 2: Create:
5238 for (offset = nelements/2; offset >= 1; offset/=2)
5240 Create: va' = vec_shift <va, offset>
5241 Create: va = vop <va, va'>
5242 } */
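/* Illustrative sketch for a PLUS reduction of a 4-element vector
   va = { a0, a1, a2, a3 } (values are assumed, not taken from the code):

     offset 2:  va' = { a2, a3, 0, 0 }        va = { a0+a2, a1+a3, a2, a3 }
     offset 1:  va' = { a1+a3, a2, a3, 0 }    va = { a0+a1+a2+a3, ... }

   after which element 0 of VA holds the final value and is extracted
   as the scalar result below.  */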
5244 tree rhs;
5246 if (dump_enabled_p ())
5247 dump_printf_loc (MSG_NOTE, vect_location,
5248 "Reduce using vector shifts\n");
5250 gimple_seq stmts = NULL;
5251 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5252 for (elt_offset = nelements / 2;
5253 elt_offset >= 1;
5254 elt_offset /= 2)
5256 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5257 indices.new_vector (sel, 2, nelements);
5258 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5259 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5260 new_temp, zero_vec, mask);
5261 new_temp = gimple_build (&stmts, code,
5262 vectype1, new_name, new_temp);
5264 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5266 /* 2.4 Extract the final scalar result. Create:
5267 s_out3 = extract_field <v_out2, bitpos> */
5269 if (dump_enabled_p ())
5270 dump_printf_loc (MSG_NOTE, vect_location,
5271 "extract scalar result\n");
5273 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5274 bitsize, bitsize_zero_node);
5275 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5276 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5277 gimple_assign_set_lhs (epilog_stmt, new_temp);
5278 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5279 scalar_results.safe_push (new_temp);
5281 else
5283 /* Case 3: Create:
5284 s = extract_field <v_out2, 0>
5285 for (offset = element_size;
5286 offset < vector_size;
5287 offset += element_size;)
5289 Create: s' = extract_field <v_out2, offset>
5290 Create: s = op <s, s'> // For non SLP cases
5291 } */
5293 if (dump_enabled_p ())
5294 dump_printf_loc (MSG_NOTE, vect_location,
5295 "Reduce using scalar code.\n");
5297 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5298 int element_bitsize = tree_to_uhwi (bitsize);
5299 tree compute_type = TREE_TYPE (vectype);
5300 gimple_seq stmts = NULL;
5301 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5303 int bit_offset;
5304 if (gimple_code (new_phi) == GIMPLE_PHI)
5305 vec_temp = PHI_RESULT (new_phi);
5306 else
5307 vec_temp = gimple_assign_lhs (new_phi);
5308 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5309 vec_temp, bitsize, bitsize_zero_node);
5311 /* In SLP we don't need to apply the reduction operation, so we just
5312 collect s' values in SCALAR_RESULTS. */
5313 if (slp_reduc)
5314 scalar_results.safe_push (new_temp);
5316 for (bit_offset = element_bitsize;
5317 bit_offset < vec_size_in_bits;
5318 bit_offset += element_bitsize)
5320 tree bitpos = bitsize_int (bit_offset);
5321 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5322 compute_type, vec_temp,
5323 bitsize, bitpos);
5324 if (slp_reduc)
5326 /* In SLP we don't need to apply the reduction operation, so
5327 we just collect s' values in SCALAR_RESULTS. */
5328 new_temp = new_name;
5329 scalar_results.safe_push (new_name);
5331 else
5332 new_temp = gimple_build (&stmts, code, compute_type,
5333 new_name, new_temp);
5337 /* The only case where we need to reduce scalar results in SLP is
5338 unrolling. If the size of SCALAR_RESULTS is greater than
5339 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5340 REDUC_GROUP_SIZE. */
5341 if (slp_reduc)
5343 tree res, first_res, new_res;
5345 /* Reduce multiple scalar results in case of SLP unrolling. */
5346 for (j = group_size; scalar_results.iterate (j, &res);
5347 j++)
5349 first_res = scalar_results[j % group_size];
5350 new_res = gimple_build (&stmts, code, compute_type,
5351 first_res, res);
5352 scalar_results[j % group_size] = new_res;
5354 for (k = 0; k < group_size; k++)
5355 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5356 scalar_results[k]);
5358 else
5360 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5361 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5362 scalar_results.safe_push (new_temp);
5365 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5368 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5369 && induc_val)
5371 /* Earlier we set the initial value to be a vector of induc_val
5372 values. Check the result and if it is induc_val then replace
5373 it with the original initial value, unless induc_val is
5374 the same as initial_def already. */
5375 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5376 induc_val);
5378 tree tmp = make_ssa_name (new_scalar_dest);
5379 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5380 initial_def, new_temp);
5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5382 scalar_results[0] = tmp;
5386 /* 2.5 Adjust the final result by the initial value of the reduction
5387 variable. (When such adjustment is not needed, then
5388 'adjustment_def' is zero). For example, if code is PLUS we create:
5389 new_temp = loop_exit_def + adjustment_def */
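/* Illustrative sketch (assumed values): for  s = 5;  loop { s += a[i]; }
   the vector PHI is typically seeded with the neutral vector
   { 0, 0, 0, 0 } and ADJUSTMENT_DEF carries the original initial
   value 5, so the code below emits the equivalent of
   new_temp = loop_exit_def + 5.  */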
5391 if (adjustment_def)
5393 gcc_assert (!slp_reduc);
5394 gimple_seq stmts = NULL;
5395 if (nested_in_vect_loop)
5397 new_phi = new_phis[0];
5398 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5399 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5400 new_temp = gimple_build (&stmts, code, vectype,
5401 PHI_RESULT (new_phi), adjustment_def);
5403 else
5405 new_temp = scalar_results[0];
5406 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5407 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5408 new_temp = gimple_build (&stmts, code, scalar_type,
5409 new_temp, adjustment_def);
5412 epilog_stmt = gimple_seq_last_stmt (stmts);
5413 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5414 if (nested_in_vect_loop)
5416 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5417 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5418 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5420 if (!double_reduc)
5421 scalar_results.quick_push (new_temp);
5422 else
5423 scalar_results[0] = new_temp;
5425 else
5426 scalar_results[0] = new_temp;
5428 new_phis[0] = epilog_stmt;
5431 if (double_reduc)
5432 loop = loop->inner;
5434 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5435 phis with new adjusted scalar results, i.e., replace use <s_out0>
5436 with use <s_out4>.
5438 Transform:
5439 loop_exit:
5440 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5441 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5442 v_out2 = reduce <v_out1>
5443 s_out3 = extract_field <v_out2, 0>
5444 s_out4 = adjust_result <s_out3>
5445 use <s_out0>
5446 use <s_out0>
5448 into:
5450 loop_exit:
5451 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5452 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5453 v_out2 = reduce <v_out1>
5454 s_out3 = extract_field <v_out2, 0>
5455 s_out4 = adjust_result <s_out3>
5456 use <s_out4>
5457 use <s_out4> */
5460 /* In an SLP reduction chain we reduce vector results into one vector if
5461 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5462 LHS of the last stmt in the reduction chain, since we are looking for
5463 the loop exit phi node. */
5464 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5466 stmt_vec_info dest_stmt_info
5467 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5468 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5469 group_size = 1;
5472 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5473 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5474 Therefore, we need to match SCALAR_RESULTS with the corresponding statements.
5475 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5476 correspond to the first vector stmt, etc.
5477 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
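/* For example (sketch with assumed numbers): with REDUC_GROUP_SIZE == 4
   and two new vector stmts, RATIO is 2, so scalar_results[0..1] belong
   to the first vector stmt and scalar_results[2..3] to the second.  */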
5478 if (group_size > new_phis.length ())
5479 gcc_assert (!(group_size % new_phis.length ()));
5481 for (k = 0; k < group_size; k++)
5483 if (slp_reduc)
5485 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5487 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5488 /* SLP statements can't participate in patterns. */
5489 gcc_assert (!orig_stmt_info);
5490 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5493 if (nested_in_vect_loop)
5495 if (double_reduc)
5496 loop = outer_loop;
5497 else
5498 gcc_unreachable ();
5501 phis.create (3);
5502 /* Find the loop-closed-use at the loop exit of the original scalar
5503 result. (The reduction result is expected to have two immediate uses,
5504 one at the latch block, and one at the loop exit). For double
5505 reductions we are looking for exit phis of the outer loop. */
5506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5508 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5510 if (!is_gimple_debug (USE_STMT (use_p)))
5511 phis.safe_push (USE_STMT (use_p));
5513 else
5515 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5517 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5519 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5521 if (!flow_bb_inside_loop_p (loop,
5522 gimple_bb (USE_STMT (phi_use_p)))
5523 && !is_gimple_debug (USE_STMT (phi_use_p)))
5524 phis.safe_push (USE_STMT (phi_use_p));
5530 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5532 /* Replace the uses: */
5533 orig_name = PHI_RESULT (exit_phi);
5534 scalar_result = scalar_results[k];
5535 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5537 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5538 SET_USE (use_p, scalar_result);
5539 update_stmt (use_stmt);
5543 phis.release ();
5547 /* Return a vector of type VECTYPE that is equal to the vector select
5548 operation "MASK ? VEC : IDENTITY". Insert the select statements
5549 before GSI. */
5551 static tree
5552 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5553 tree vec, tree identity)
5555 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5556 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5557 mask, vec, identity);
5558 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5559 return cond;
5562 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5563 order, starting with LHS. Insert the extraction statements before GSI and
5564 associate the new scalar SSA names with variable SCALAR_DEST.
5565 Return the SSA name for the result. */
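/* Illustrative sketch of the expansion for a 4-element VECTOR_RHS v
   and incoming LHS acc (the names are assumptions, not taken from the code):

     s0 = <element 0 of v>;   acc0 = acc  CODE s0;
     s1 = <element 1 of v>;   acc1 = acc0 CODE s1;
     s2 = <element 2 of v>;   acc2 = acc1 CODE s2;
     s3 = <element 3 of v>;   acc3 = acc2 CODE s3;

   and acc3 is the returned SSA name.  */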
5567 static tree
5568 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5569 tree_code code, tree lhs, tree vector_rhs)
5571 tree vectype = TREE_TYPE (vector_rhs);
5572 tree scalar_type = TREE_TYPE (vectype);
5573 tree bitsize = TYPE_SIZE (scalar_type);
5574 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5575 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5577 for (unsigned HOST_WIDE_INT bit_offset = 0;
5578 bit_offset < vec_size_in_bits;
5579 bit_offset += element_bitsize)
5581 tree bitpos = bitsize_int (bit_offset);
5582 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5583 bitsize, bitpos);
5585 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5586 rhs = make_ssa_name (scalar_dest, stmt);
5587 gimple_assign_set_lhs (stmt, rhs);
5588 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5590 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5591 tree new_name = make_ssa_name (scalar_dest, stmt);
5592 gimple_assign_set_lhs (stmt, new_name);
5593 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5594 lhs = new_name;
5596 return lhs;
5599 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5600 type of the vector input. */
5602 static internal_fn
5603 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5605 internal_fn mask_reduc_fn;
5607 switch (reduc_fn)
5609 case IFN_FOLD_LEFT_PLUS:
5610 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5611 break;
5613 default:
5614 return IFN_LAST;
5617 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5618 OPTIMIZE_FOR_SPEED))
5619 return mask_reduc_fn;
5620 return IFN_LAST;
5623 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5624 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5625 statement. CODE is the operation performed by STMT_INFO and OPS are
5626 its scalar operands. REDUC_INDEX is the index of the operand in
5627 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5628 implements in-order reduction, or IFN_LAST if we should open-code it.
5629 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5630 that should be used to control the operation in a fully-masked loop. */
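/* Illustrative sketch: for an in-order floating-point sum

     for (i = 0; i < n; i++)
       res += a[i];

   each vector of loaded elements is folded into RES strictly from left
   to right, either with one IFN_FOLD_LEFT_PLUS (or its masked variant)
   call per vector, or, if REDUC_FN is IFN_LAST, by open-coding the
   element extractions via vect_expand_fold_left.  No reassociation
   takes place, so the result matches the scalar loop when reassociation
   is not permitted.  */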
5632 static bool
5633 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5634 gimple_stmt_iterator *gsi,
5635 stmt_vec_info *vec_stmt, slp_tree slp_node,
5636 gimple *reduc_def_stmt,
5637 tree_code code, internal_fn reduc_fn,
5638 tree ops[3], tree vectype_in,
5639 int reduc_index, vec_loop_masks *masks)
5641 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5642 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5643 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5644 stmt_vec_info new_stmt_info = NULL;
5645 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5647 int ncopies;
5648 if (slp_node)
5649 ncopies = 1;
5650 else
5651 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5653 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5654 gcc_assert (ncopies == 1);
5655 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5657 if (slp_node)
5658 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5659 TYPE_VECTOR_SUBPARTS (vectype_in)));
5661 tree op0 = ops[1 - reduc_index];
5663 int group_size = 1;
5664 stmt_vec_info scalar_dest_def_info;
5665 auto_vec<tree> vec_oprnds0;
5666 if (slp_node)
5668 auto_vec<vec<tree> > vec_defs (2);
5669 vect_get_slp_defs (slp_node, &vec_defs);
5670 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5671 vec_defs[0].release ();
5672 vec_defs[1].release ();
5673 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5674 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5676 else
5678 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5679 vec_oprnds0.create (1);
5680 vec_oprnds0.quick_push (loop_vec_def0);
5681 scalar_dest_def_info = stmt_info;
5684 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5685 tree scalar_type = TREE_TYPE (scalar_dest);
5686 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5688 int vec_num = vec_oprnds0.length ();
5689 gcc_assert (vec_num == 1 || slp_node);
5690 tree vec_elem_type = TREE_TYPE (vectype_out);
5691 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5693 tree vector_identity = NULL_TREE;
5694 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5695 vector_identity = build_zero_cst (vectype_out);
5697 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5698 int i;
5699 tree def0;
5700 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5702 gimple *new_stmt;
5703 tree mask = NULL_TREE;
5704 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5705 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5707 /* Handle MINUS by adding the negative. */
5708 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5710 tree negated = make_ssa_name (vectype_out);
5711 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5712 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5713 def0 = negated;
5716 if (mask && mask_reduc_fn == IFN_LAST)
5717 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5718 vector_identity);
5720 /* On the first iteration the input is simply the scalar phi
5721 result, and for subsequent iterations it is the output of
5722 the preceding operation. */
5723 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5725 if (mask && mask_reduc_fn != IFN_LAST)
5726 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5727 def0, mask);
5728 else
5729 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5730 def0);
5731 /* For chained SLP reductions the output of the previous reduction
5732 operation serves as the input of the next. For the final statement
5733 the output cannot be a temporary - we reuse the original
5734 scalar destination of the last statement. */
5735 if (i != vec_num - 1)
5737 gimple_set_lhs (new_stmt, scalar_dest_var);
5738 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5739 gimple_set_lhs (new_stmt, reduc_var);
5742 else
5744 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5745 reduc_var, def0);
5746 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5747 /* Remove the statement, so that we can use the same code paths
5748 as for statements that we've just created. */
5749 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5750 gsi_remove (&tmp_gsi, true);
5753 if (i == vec_num - 1)
5755 gimple_set_lhs (new_stmt, scalar_dest);
5756 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5757 new_stmt);
5759 else
5760 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5761 new_stmt, gsi);
5763 if (slp_node)
5764 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5767 if (!slp_node)
5768 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5770 return true;
5773 /* Function is_nonwrapping_integer_induction.
5775 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5776 does not cause overflow. */
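/* Worked example (assumed numbers): for base 0, step 4 and at most
   100 loop iterations the code below computes 0 + 4 * 100 = 400 in
   wide arithmetic and accepts the induction only if 400 still fits in
   the precision of the PHI result type.  */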
5778 static bool
5779 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5781 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5782 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5784 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5785 widest_int ni, max_loop_value, lhs_max;
5786 wi::overflow_type overflow = wi::OVF_NONE;
5788 /* Make sure the loop is integer based. */
5789 if (TREE_CODE (base) != INTEGER_CST
5790 || TREE_CODE (step) != INTEGER_CST)
5791 return false;
5793 /* Check that the max size of the loop will not wrap. */
5795 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5796 return true;
5798 if (! max_stmt_executions (loop, &ni))
5799 return false;
5801 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5802 &overflow);
5803 if (overflow)
5804 return false;
5806 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5807 TYPE_SIGN (lhs_type), &overflow);
5808 if (overflow)
5809 return false;
5811 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5812 <= TYPE_PRECISION (lhs_type));
5815 /* Check if masking can be supported by inserting a conditional expression.
5816 CODE is the code for the operation. COND_FN is the conditional internal
5817 function, if it exists. VECTYPE_IN is the type of the vector input. */
5818 static bool
5819 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5820 tree vectype_in)
5822 if (cond_fn != IFN_LAST
5823 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5824 OPTIMIZE_FOR_SPEED))
5825 return false;
5827 switch (code)
5829 case DOT_PROD_EXPR:
5830 case SAD_EXPR:
5831 return true;
5833 default:
5834 return false;
5838 /* Insert a conditional expression to enable masked vectorization. CODE is the
5839 code for the operation. VOP is the array of operands. MASK is the loop
5840 mask. GSI is a statement iterator used to place the new conditional
5841 expression. */
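/* Illustrative sketch of the selects emitted below:

     DOT_PROD_EXPR:  op1' = mask ? op1 : 0     (inactive lanes add 0)
     SAD_EXPR:       op1' = mask ? op1 : op0   (|op0 - op1'| is then 0)

   so lanes that are masked off do not change the accumulated result.  */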
5842 static void
5843 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5844 gimple_stmt_iterator *gsi)
5846 switch (code)
5848 case DOT_PROD_EXPR:
5850 tree vectype = TREE_TYPE (vop[1]);
5851 tree zero = build_zero_cst (vectype);
5852 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5853 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5854 mask, vop[1], zero);
5855 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5856 vop[1] = masked_op1;
5857 break;
5860 case SAD_EXPR:
5862 tree vectype = TREE_TYPE (vop[1]);
5863 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5864 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5865 mask, vop[1], vop[0]);
5866 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5867 vop[1] = masked_op1;
5868 break;
5871 default:
5872 gcc_unreachable ();
5876 /* Function vectorizable_reduction.
5878 Check if STMT_INFO performs a reduction operation that can be vectorized.
5879 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5880 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5881 Return true if STMT_INFO is vectorizable in this way.
5883 This function also handles reduction idioms (patterns) that have been
5884 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5885 may be of this form:
5886 X = pattern_expr (arg0, arg1, ..., X)
5887 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5888 sequence that had been detected and replaced by the pattern-stmt
5889 (STMT_INFO).
5891 This function also handles reduction of condition expressions, for example:
5892 for (int i = 0; i < N; i++)
5893 if (a[i] < value)
5894 last = a[i];
5895 This is handled by vectorising the loop and creating an additional vector
5896 containing the loop indexes for which "a[i] < value" was true. In the
5897 function epilogue this is reduced to a single max value and then used to
5898 index into the vector of results.
5900 In some cases of reduction patterns, the type of the reduction variable X is
5901 different than the type of the other arguments of STMT_INFO.
5902 In such cases, the vectype that is used when transforming STMT_INFO into
5903 a vector stmt is different than the vectype that is used to determine the
5904 vectorization factor, because it consists of a different number of elements
5905 than the actual number of elements that are being operated upon in parallel.
5907 For example, consider an accumulation of shorts into an int accumulator.
5908 On some targets it's possible to vectorize this pattern operating on 8
5909 shorts at a time (hence, the vectype for purposes of determining the
5910 vectorization factor should be V8HI); on the other hand, the vectype that
5911 is used to create the vector form is actually V4SI (the type of the result).
5913 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5914 indicates what is the actual level of parallelism (V8HI in the example), so
5915 that the right vectorization factor would be derived. This vectype
5916 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5917 be used to create the vectorized stmt. The right vectype for the vectorized
5918 stmt is obtained from the type of the result X:
5919 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5921 This means that, contrary to "regular" reductions (or "regular" stmts in
5922 general), the following equation:
5923 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5924 does *NOT* necessarily hold for reduction patterns. */
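/* Illustrative sketch of the condition reduction described above,
   assuming value == 5, a = { 10, 3, 8, 1 } and a single vector
   iteration of four lanes:

     per-lane result vector      = { last, 3, last, 1 }   (last = old value)
     1-based index where matched = {    0, 2,    0, 4 }   (0 = no match)

   the epilogue reduces the index vector with a MAX, giving 4, and the
   corresponding lane of the result vector yields the value the scalar
   loop would have stored last, here 1.  */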
5926 bool
5927 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5928 slp_instance slp_node_instance,
5929 stmt_vector_for_cost *cost_vec)
5931 tree scalar_dest;
5932 tree vectype_in = NULL_TREE;
5933 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5934 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5935 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5936 stmt_vec_info cond_stmt_vinfo = NULL;
5937 tree scalar_type;
5938 int i;
5939 int ncopies;
5940 bool single_defuse_cycle = false;
5941 bool nested_cycle = false;
5942 bool double_reduc = false;
5943 int vec_num;
5944 tree tem;
5945 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5946 tree cond_reduc_val = NULL_TREE;
5948 /* Make sure it was already recognized as a reduction computation. */
5949 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5950 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5951 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5952 return false;
5954 /* The stmt we store reduction analysis meta on. */
5955 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5956 reduc_info->is_reduc_info = true;
5958 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5960 if (is_a <gphi *> (stmt_info->stmt))
5961 /* Analysis for double-reduction is done on the outer
5962 loop PHI; nested cycles have no further restrictions. */
5963 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5964 else
5965 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5966 return true;
5969 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5970 stmt_vec_info phi_info = stmt_info;
5971 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5972 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5974 if (!is_a <gphi *> (stmt_info->stmt))
5976 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5977 return true;
5979 if (slp_node)
5981 slp_node_instance->reduc_phis = slp_node;
5982 /* ??? We're leaving slp_node to point to the PHIs; we only
5983 need it to get at the number of vector stmts, which wasn't
5984 yet initialized for the instance root. */
5986 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5987 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5988 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5990 use_operand_p use_p;
5991 gimple *use_stmt;
5992 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5993 &use_p, &use_stmt);
5994 gcc_assert (res);
5995 phi_info = loop_vinfo->lookup_stmt (use_stmt);
5996 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6000 /* PHIs should not participate in patterns. */
6001 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6002 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6004 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6005 and compute the reduction chain length. */
6006 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6007 loop_latch_edge (loop));
6008 unsigned reduc_chain_length = 0;
6009 bool only_slp_reduc_chain = true;
6010 stmt_info = NULL;
6011 while (reduc_def != PHI_RESULT (reduc_def_phi))
6013 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6014 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6015 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6019 "reduction chain broken by patterns.\n");
6020 return false;
6022 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6023 only_slp_reduc_chain = false;
6024 /* ??? For epilogue generation live members of the chain need
6025 to point back to the PHI via their original stmt for
6026 info_for_reduction to work. */
6027 if (STMT_VINFO_LIVE_P (vdef))
6028 STMT_VINFO_REDUC_DEF (def) = phi_info;
6029 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (vdef->stmt)))
6031 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (vdef->stmt)),
6032 TREE_TYPE (gimple_assign_rhs1 (vdef->stmt))))
6034 if (dump_enabled_p ())
6035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6036 "conversion in the reduction chain.\n");
6037 return false;
6040 else if (!stmt_info)
6041 /* First non-conversion stmt. */
6042 stmt_info = vdef;
6043 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6044 reduc_chain_length++;
6046 /* PHIs should not participate in patterns. */
6047 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6049 if (nested_in_vect_loop_p (loop, stmt_info))
6051 loop = loop->inner;
6052 nested_cycle = true;
6055 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6056 element. */
6057 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6059 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6060 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6062 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6063 gcc_assert (slp_node
6064 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6066 /* 1. Is vectorizable reduction? */
6067 /* Not supportable if the reduction variable is used in the loop, unless
6068 it's a reduction chain. */
6069 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6070 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6071 return false;
6073 /* Reductions that are not used even in an enclosing outer-loop
6074 are expected to be "live" (used out of the loop). */
6075 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6076 && !STMT_VINFO_LIVE_P (stmt_info))
6077 return false;
6079 /* 2. Has this been recognized as a reduction pattern?
6081 Check if STMT represents a pattern that has been recognized
6082 in earlier analysis stages. For stmts that represent a pattern,
6083 the STMT_VINFO_RELATED_STMT field records the last stmt in
6084 the original sequence that constitutes the pattern. */
6086 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6087 if (orig_stmt_info)
6089 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6090 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6093 /* 3. Check the operands of the operation. The first operands are defined
6094 inside the loop body. The last operand is the reduction variable,
6095 which is defined by the loop-header-phi. */
6097 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6098 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6099 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6100 enum tree_code code = gimple_assign_rhs_code (stmt);
6101 bool lane_reduc_code_p
6102 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6103 int op_type = TREE_CODE_LENGTH (code);
6105 scalar_dest = gimple_assign_lhs (stmt);
6106 scalar_type = TREE_TYPE (scalar_dest);
6107 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6108 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6109 return false;
6111 /* Do not try to vectorize bit-precision reductions. */
6112 if (!type_has_mode_precision_p (scalar_type))
6113 return false;
6115 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6116 which means the only use of the PHI result may be in the lane-reducing operation. */
6117 if (lane_reduc_code_p
6118 && reduc_chain_length != 1
6119 && !only_slp_reduc_chain)
6121 if (dump_enabled_p ())
6122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6123 "lane-reducing reduction with extra stmts.\n");
6124 return false;
6127 /* All uses but the last are expected to be defined in the loop.
6128 The last use is the reduction variable. In case of nested cycle this
6129 assumption is not true: we use reduc_index to record the index of the
6130 reduction variable. */
6131 reduc_def = PHI_RESULT (reduc_def_phi);
6132 for (i = 0; i < op_type; i++)
6134 tree op = gimple_op (stmt, i + 1);
6135 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6136 if (i == 0 && code == COND_EXPR)
6137 continue;
6139 stmt_vec_info def_stmt_info;
6140 enum vect_def_type dt;
6141 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6142 &def_stmt_info))
6144 if (dump_enabled_p ())
6145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146 "use not simple.\n");
6147 return false;
6149 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6150 continue;
6152 /* There should be only one cycle def in the stmt, the one
6153 leading to reduc_def. */
6154 if (VECTORIZABLE_CYCLE_DEF (dt))
6155 return false;
6157 /* To properly compute ncopies we are interested in the widest
6158 non-reduction input type in case we're looking at a widening
6159 accumulation that we later handle in vect_transform_reduction. */
6160 if (lane_reduc_code_p
6161 && tem
6162 && (!vectype_in
6163 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6164 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6165 vectype_in = tem;
6167 if (code == COND_EXPR)
6169 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6170 if (dt == vect_constant_def)
6172 cond_reduc_dt = dt;
6173 cond_reduc_val = op;
6175 if (dt == vect_induction_def
6176 && def_stmt_info
6177 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6179 cond_reduc_dt = dt;
6180 cond_stmt_vinfo = def_stmt_info;
6184 if (!vectype_in)
6185 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6186 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6188 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6189 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6190 /* If we have a condition reduction, see if we can simplify it further. */
6191 if (v_reduc_type == COND_REDUCTION)
6193 if (slp_node)
6194 return false;
6196 /* When the condition uses the reduction value in the condition, fail. */
6197 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6201 "condition depends on previous iteration\n");
6202 return false;
6205 if (reduc_chain_length == 1
6206 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6207 vectype_in, OPTIMIZE_FOR_SPEED))
6209 if (dump_enabled_p ())
6210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211 "optimizing condition reduction with"
6212 " FOLD_EXTRACT_LAST.\n");
6213 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6215 else if (cond_reduc_dt == vect_induction_def)
6217 tree base
6218 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6219 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6221 gcc_assert (TREE_CODE (base) == INTEGER_CST
6222 && TREE_CODE (step) == INTEGER_CST);
6223 cond_reduc_val = NULL_TREE;
6224 enum tree_code cond_reduc_op_code = ERROR_MARK;
6225 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6226 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6228 /* Find a suitable value: below base for MAX_EXPR, above base for
6229 MIN_EXPR; for now punt if base is the minimum value of the type for
6230 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6231 else if (tree_int_cst_sgn (step) == -1)
6233 cond_reduc_op_code = MIN_EXPR;
6234 if (tree_int_cst_sgn (base) == -1)
6235 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6236 else if (tree_int_cst_lt (base,
6237 TYPE_MAX_VALUE (TREE_TYPE (base))))
6238 cond_reduc_val
6239 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6241 else
6243 cond_reduc_op_code = MAX_EXPR;
6244 if (tree_int_cst_sgn (base) == 1)
6245 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6246 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6247 base))
6248 cond_reduc_val
6249 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6251 if (cond_reduc_val)
6253 if (dump_enabled_p ())
6254 dump_printf_loc (MSG_NOTE, vect_location,
6255 "condition expression based on "
6256 "integer induction.\n");
6257 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6258 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6259 = cond_reduc_val;
6260 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6263 else if (cond_reduc_dt == vect_constant_def)
6265 enum vect_def_type cond_initial_dt;
6266 tree cond_initial_val
6267 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6269 gcc_assert (cond_reduc_val != NULL_TREE);
6270 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6271 if (cond_initial_dt == vect_constant_def
6272 && types_compatible_p (TREE_TYPE (cond_initial_val),
6273 TREE_TYPE (cond_reduc_val)))
6275 tree e = fold_binary (LE_EXPR, boolean_type_node,
6276 cond_initial_val, cond_reduc_val);
6277 if (e && (integer_onep (e) || integer_zerop (e)))
6279 if (dump_enabled_p ())
6280 dump_printf_loc (MSG_NOTE, vect_location,
6281 "condition expression based on "
6282 "compile time constant.\n");
6283 /* Record reduction code at analysis stage. */
6284 STMT_VINFO_REDUC_CODE (reduc_info)
6285 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6286 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6292 if (STMT_VINFO_LIVE_P (phi_info))
6293 return false;
6295 if (slp_node)
6296 ncopies = 1;
6297 else
6298 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6300 gcc_assert (ncopies >= 1);
6302 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6304 if (nested_cycle)
6306 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6307 == vect_double_reduction_def);
6308 double_reduc = true;
6311 /* 4.2. Check support for the epilog operation.
6313 If STMT represents a reduction pattern, then the type of the
6314 reduction variable may be different than the type of the rest
6315 of the arguments. For example, consider the case of accumulation
6316 of shorts into an int accumulator. The original code:
6317 S1: int_a = (int) short_a;
6318 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6320 was replaced with:
6321 STMT: int_acc = widen_sum <short_a, int_acc>
6323 This means that:
6324 1. The tree-code that is used to create the vector operation in the
6325 epilog code (that reduces the partial results) is not the
6326 tree-code of STMT, but is rather the tree-code of the original
6327 stmt from the pattern that STMT is replacing. I.e, in the example
6328 above we want to use 'widen_sum' in the loop, but 'plus' in the
6329 epilog.
6330 2. The type (mode) we use to check available target support
6331 for the vector operation to be created in the *epilog*, is
6332 determined by the type of the reduction variable (in the example
6333 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6334 However the type (mode) we use to check available target support
6335 for the vector operation to be created *inside the loop*, is
6336 determined by the type of the other arguments to STMT (in the
6337 example we'd check this: optab_handler (widen_sum_optab,
6338 vect_short_mode)).
6340 This is contrary to "regular" reductions, in which the types of all
6341 the arguments are the same as the type of the reduction variable.
6342 For "regular" reductions we can therefore use the same vector type
6343 (and also the same tree-code) when generating the epilog code and
6344 when generating the code inside the loop. */
6346 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6347 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6349 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6350 if (reduction_type == TREE_CODE_REDUCTION)
6352 /* Check whether it's ok to change the order of the computation.
6353 Generally, when vectorizing a reduction we change the order of the
6354 computation. This may change the behavior of the program in some
6355 cases, so we need to check that this is ok. One exception is when
6356 vectorizing an outer-loop: the inner-loop is executed sequentially,
6357 and therefore vectorizing reductions in the inner-loop during
6358 outer-loop vectorization is safe. */
6359 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6361 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6362 is not directly used in stmt. */
6363 if (!only_slp_reduc_chain
6364 && reduc_chain_length != 1)
6366 if (dump_enabled_p ())
6367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6368 "in-order reduction chain without SLP.\n");
6369 return false;
6371 STMT_VINFO_REDUC_TYPE (reduc_info)
6372 = reduction_type = FOLD_LEFT_REDUCTION;
6374 else if (!commutative_tree_code (orig_code)
6375 || !associative_tree_code (orig_code))
6377 if (dump_enabled_p ())
6378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6379 "reduction: not commutative/associative");
6380 return false;
6384 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6385 && ncopies > 1)
6387 if (dump_enabled_p ())
6388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6389 "multiple types in double reduction or condition "
6390 "reduction or fold-left reduction.\n");
6391 return false;
6394 internal_fn reduc_fn = IFN_LAST;
6395 if (reduction_type == TREE_CODE_REDUCTION
6396 || reduction_type == FOLD_LEFT_REDUCTION
6397 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6398 || reduction_type == CONST_COND_REDUCTION)
6400 if (reduction_type == FOLD_LEFT_REDUCTION
6401 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6402 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6404 if (reduc_fn != IFN_LAST
6405 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6406 OPTIMIZE_FOR_SPEED))
6408 if (dump_enabled_p ())
6409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6410 "reduc op not supported by target.\n");
6412 reduc_fn = IFN_LAST;
6415 else
6417 if (!nested_cycle || double_reduc)
6419 if (dump_enabled_p ())
6420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6421 "no reduc code for scalar code.\n");
6423 return false;
6427 else if (reduction_type == COND_REDUCTION)
6429 int scalar_precision
6430 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6431 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6432 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6433 nunits_out);
6435 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6436 OPTIMIZE_FOR_SPEED))
6437 reduc_fn = IFN_REDUC_MAX;
6439 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6441 if (reduction_type != EXTRACT_LAST_REDUCTION
6442 && (!nested_cycle || double_reduc)
6443 && reduc_fn == IFN_LAST
6444 && !nunits_out.is_constant ())
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448 "missing target support for reduction on"
6449 " variable-length vectors.\n");
6450 return false;
6453 /* For SLP reductions, see if there is a neutral value we can use. */
6454 tree neutral_op = NULL_TREE;
6455 if (slp_node)
6456 neutral_op = neutral_op_for_slp_reduction
6457 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6458 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6460 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6462 /* We can't support in-order reductions of code such as this:
6464 for (int i = 0; i < n1; ++i)
6465 for (int j = 0; j < n2; ++j)
6466 l += a[j];
6468 since GCC effectively transforms the loop when vectorizing:
6470 for (int i = 0; i < n1 / VF; ++i)
6471 for (int j = 0; j < n2; ++j)
6472 for (int k = 0; k < VF; ++k)
6473 l += a[j];
6475 which is a reassociation of the original operation. */
6476 if (dump_enabled_p ())
6477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6478 "in-order double reduction not supported.\n");
6480 return false;
6483 if (reduction_type == FOLD_LEFT_REDUCTION
6484 && slp_node
6485 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6487 /* We cannot use in-order reductions in this case because there is
6488 an implicit reassociation of the operations involved. */
6489 if (dump_enabled_p ())
6490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6491 "in-order unchained SLP reductions not supported.\n");
6492 return false;
6495 /* For double reductions, and for SLP reductions with a neutral value,
6496 we construct a variable-length initial vector by loading a vector
6497 full of the neutral value and then shift-and-inserting the start
6498 values into the low-numbered elements. */
6499 if ((double_reduc || neutral_op)
6500 && !nunits_out.is_constant ()
6501 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6502 vectype_out, OPTIMIZE_FOR_SPEED))
6504 if (dump_enabled_p ())
6505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6506 "reduction on variable-length vectors requires"
6507 " target support for a vector-shift-and-insert"
6508 " operation.\n");
6509 return false;
6512 /* Check extra constraints for variable-length unchained SLP reductions. */
6513 if (STMT_SLP_TYPE (stmt_info)
6514 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6515 && !nunits_out.is_constant ())
6517 /* We checked above that we could build the initial vector when
6518 there's a neutral element value. Check here for the case in
6519 which each SLP statement has its own initial value and in which
6520 that value needs to be repeated for every instance of the
6521 statement within the initial vector. */
6522 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6523 if (!neutral_op
6524 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6525 TREE_TYPE (vectype_out)))
6527 if (dump_enabled_p ())
6528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6529 "unsupported form of SLP reduction for"
6530 " variable-length vectors: cannot build"
6531 " initial vector.\n");
6532 return false;
6534 /* The epilogue code relies on the number of elements being a multiple
6535 of the group size. The duplicate-and-interleave approach to setting
6536 up the initial vector does too. */
6537 if (!multiple_p (nunits_out, group_size))
6539 if (dump_enabled_p ())
6540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6541 "unsupported form of SLP reduction for"
6542 " variable-length vectors: the vector size"
6543 " is not a multiple of the number of results.\n");
6544 return false;
6548 if (reduction_type == COND_REDUCTION)
6550 widest_int ni;
6552 if (! max_loop_iterations (loop, &ni))
6554 if (dump_enabled_p ())
6555 dump_printf_loc (MSG_NOTE, vect_location,
6556 "loop count not known, cannot create cond "
6557 "reduction.\n");
6558 return false;
6560 /* Convert backedges to iterations. */
6561 ni += 1;
6563 /* The additional index will be the same type as the condition. Check
6564 that the loop count fits into this type less one (because we'll use up
6565 the zero slot for when there are no matches). */
6566 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6567 if (wi::geu_p (ni, wi::to_widest (max_index)))
6569 if (dump_enabled_p ())
6570 dump_printf_loc (MSG_NOTE, vect_location,
6571 "loop size is greater than data size.\n");
6572 return false;
6576 /* In case the vectorization factor (VF) is bigger than the number
6577 of elements that we can fit in a vectype (nunits), we have to generate
6578 more than one vector stmt, i.e. we need to "unroll" the
6579 vector stmt by a factor VF/nunits. For more details see documentation
6580 in vectorizable_operation. */
6582 /* If the reduction is used in an outer loop we need to generate
6583 VF intermediate results, like so (e.g. for ncopies=2):
6584 r0 = phi (init, r0)
6585 r1 = phi (init, r1)
6586 r0 = x0 + r0;
6587 r1 = x1 + r1;
6588 (i.e. we generate VF results in 2 registers).
6589 In this case we have a separate def-use cycle for each copy, and therefore
6590 for each copy we get the vector def for the reduction variable from the
6591 respective phi node created for this copy.
6593 Otherwise (the reduction is unused in the loop nest), we can combine
6594 together intermediate results, like so (e.g. for ncopies=2):
6595 r = phi (init, r)
6596 r = x0 + r;
6597 r = x1 + r;
6598 (i.e. we generate VF/2 results in a single register).
6599 In this case for each copy we get the vector def for the reduction variable
6600 from the vectorized reduction operation generated in the previous iteration.
6602 This only works when we see both the reduction PHI and its only consumer
6603 in vectorizable_reduction and there are no intermediate stmts
6604 participating. */
6605 if (ncopies > 1
6606 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6607 && reduc_chain_length == 1)
6608 single_defuse_cycle = true;
6610 if (single_defuse_cycle || lane_reduc_code_p)
6612 gcc_assert (code != COND_EXPR);
6614 /* 4. Supportable by target? */
6615 bool ok = true;
6617 /* 4.1. check support for the operation in the loop */
6618 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6619 if (!optab)
6621 if (dump_enabled_p ())
6622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623 "no optab.\n");
6624 ok = false;
6627 machine_mode vec_mode = TYPE_MODE (vectype_in);
6628 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6630 if (dump_enabled_p ())
6631 dump_printf (MSG_NOTE, "op not supported by target.\n");
6632 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6633 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6634 ok = false;
6635 else
6636 if (dump_enabled_p ())
6637 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6640 /* Worthwhile without SIMD support? */
6641 if (ok
6642 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6643 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6645 if (dump_enabled_p ())
6646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6647 "not worthwhile without SIMD support.\n");
6648 ok = false;
6651 /* lane-reducing operations have to go through vect_transform_reduction.
6652 For the other cases try without the single cycle optimization. */
6653 if (!ok)
6655 if (lane_reduc_code_p)
6656 return false;
6657 else
6658 single_defuse_cycle = false;
6661 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6663 /* If the reduction stmt is one of the patterns that have lane
6664 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
6665 if ((ncopies > 1 && ! single_defuse_cycle)
6666 && lane_reduc_code_p)
6668 if (dump_enabled_p ())
6669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670 "multi def-use cycle not possible for lane-reducing "
6671 "reduction operation\n");
6672 return false;
6675 if (slp_node)
6676 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6677 else
6678 vec_num = 1;
6680 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6681 cost_vec);
6682 if (dump_enabled_p ()
6683 && reduction_type == FOLD_LEFT_REDUCTION)
6684 dump_printf_loc (MSG_NOTE, vect_location,
6685 "using an in-order (fold-left) reduction.\n");
6686 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6687 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6688 reductions go through their own vectorizable_* routines. */
6689 if (!single_defuse_cycle
6690 && code != DOT_PROD_EXPR
6691 && code != WIDEN_SUM_EXPR
6692 && code != SAD_EXPR
6693 && reduction_type != FOLD_LEFT_REDUCTION)
6695 stmt_vec_info tem
6696 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6697 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6699 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6700 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6702 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6703 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6705 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6707 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6708 internal_fn cond_fn = get_conditional_internal_fn (code);
6710 if (reduction_type != FOLD_LEFT_REDUCTION
6711 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6712 && (cond_fn == IFN_LAST
6713 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6714 OPTIMIZE_FOR_SPEED)))
6716 if (dump_enabled_p ())
6717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6718 "can't use a fully-masked loop because no"
6719 " conditional operation is available.\n");
6720 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6722 else if (reduction_type == FOLD_LEFT_REDUCTION
6723 && reduc_fn == IFN_LAST
6724 && !expand_vec_cond_expr_p (vectype_in,
6725 truth_type_for (vectype_in),
6726 SSA_NAME))
6728 if (dump_enabled_p ())
6729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6730 "can't use a fully-masked loop because no"
6731 " conditional operation is available.\n");
6732 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6734 else
6735 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6736 vectype_in, NULL);
6738 return true;
6741 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6742 value. */
6744 bool
6745 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6746 stmt_vec_info *vec_stmt, slp_tree slp_node)
6748 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6749 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6751 int i;
6752 int ncopies;
6753 int j;
6754 int vec_num;
6756 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6757 gcc_assert (reduc_info->is_reduc_info);
6759 if (nested_in_vect_loop_p (loop, stmt_info))
6761 loop = loop->inner;
6762 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6765 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6766 enum tree_code code = gimple_assign_rhs_code (stmt);
6767 int op_type = TREE_CODE_LENGTH (code);
6769 /* Flatten RHS. */
6770 tree ops[3];
6771 switch (get_gimple_rhs_class (code))
6773 case GIMPLE_TERNARY_RHS:
6774 ops[2] = gimple_assign_rhs3 (stmt);
6775 /* Fall thru. */
6776 case GIMPLE_BINARY_RHS:
6777 ops[0] = gimple_assign_rhs1 (stmt);
6778 ops[1] = gimple_assign_rhs2 (stmt);
6779 break;
6780 default:
6781 gcc_unreachable ();
6784 /* All uses but the last are expected to be defined in the loop.
6785 The last use is the reduction variable. In case of nested cycle this
6786 assumption is not true: we use reduc_index to record the index of the
6787 reduction variable. */
6788 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6789 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6790 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6791 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6793 if (slp_node)
6795 ncopies = 1;
6796 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6798 else
6800 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6801 vec_num = 1;
6804 internal_fn cond_fn = get_conditional_internal_fn (code);
6805 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6806 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6808 /* Transform. */
6809 stmt_vec_info new_stmt_info = NULL;
6810 stmt_vec_info prev_stmt_info;
6811 tree new_temp = NULL_TREE;
6812 auto_vec<tree> vec_oprnds0;
6813 auto_vec<tree> vec_oprnds1;
6814 auto_vec<tree> vec_oprnds2;
6815 tree def0;
6817 if (dump_enabled_p ())
6818 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6820 /* FORNOW: Multiple types are not supported for condition. */
6821 if (code == COND_EXPR)
6822 gcc_assert (ncopies == 1);
6824 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6826 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
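/* Illustrative note: a FOLD_LEFT_REDUCTION keeps the scalar evaluation
order, accumulating the vector elements one lane at a time
(acc = acc OP v[0]; acc = acc OP v[1]; ...), which is why it is handed
off to a dedicated routine below rather than the generic path.  */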
6827 if (reduction_type == FOLD_LEFT_REDUCTION)
6829 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6830 return vectorize_fold_left_reduction
6831 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6832 reduc_fn, ops, vectype_in, reduc_index, masks);
6835 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6836 gcc_assert (single_defuse_cycle
6837 || code == DOT_PROD_EXPR
6838 || code == WIDEN_SUM_EXPR
6839 || code == SAD_EXPR);
6841 /* Create the destination vector */
6842 tree scalar_dest = gimple_assign_lhs (stmt);
6843 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6845 prev_stmt_info = NULL;
6846 if (!slp_node)
6848 vec_oprnds0.create (1);
6849 vec_oprnds1.create (1);
6850 if (op_type == ternary_op)
6851 vec_oprnds2.create (1);
6854 for (j = 0; j < ncopies; j++)
6856 /* Handle uses. */
6857 if (j == 0)
6859 if (slp_node)
6861 /* Get vec defs for all the operands except the reduction index,
6862 ensuring the ordering of the ops in the vector is kept. */
6863 auto_vec<vec<tree>, 3> vec_defs;
6864 vect_get_slp_defs (slp_node, &vec_defs);
6865 vec_oprnds0.safe_splice (vec_defs[0]);
6866 vec_defs[0].release ();
6867 vec_oprnds1.safe_splice (vec_defs[1]);
6868 vec_defs[1].release ();
6869 if (op_type == ternary_op)
6871 vec_oprnds2.safe_splice (vec_defs[2]);
6872 vec_defs[2].release ();
6875 else
6877 vec_oprnds0.quick_push
6878 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6879 vec_oprnds1.quick_push
6880 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6881 if (op_type == ternary_op)
6882 vec_oprnds2.quick_push
6883 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6886 else
6888 if (!slp_node)
6890 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6892 if (single_defuse_cycle && reduc_index == 0)
6893 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6894 else
6895 vec_oprnds0[0]
6896 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6897 vec_oprnds0[0]);
6898 if (single_defuse_cycle && reduc_index == 1)
6899 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6900 else
6901 vec_oprnds1[0]
6902 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6903 vec_oprnds1[0]);
6904 if (op_type == ternary_op)
6906 if (single_defuse_cycle && reduc_index == 2)
6907 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6908 else
6909 vec_oprnds2[0]
6910 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6911 vec_oprnds2[0]);
6916 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6918 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6919 if (masked_loop_p && !mask_by_cond_expr)
6921 /* Make sure that the reduction accumulator is vop[0]. */
6922 if (reduc_index == 1)
6924 gcc_assert (commutative_tree_code (code));
6925 std::swap (vop[0], vop[1]);
6927 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6928 vectype_in, i * ncopies + j);
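/* A sketch of what this emits, target permitting: for code == PLUS_EXPR,
cond_fn is IFN_COND_ADD and the call built below is
new_temp = .COND_ADD (mask, vop[0], vop[1], vop[0]),
i.e. lanes switched off by the loop mask keep the accumulator value.  */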
6929 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6930 vop[0], vop[1],
6931 vop[0]);
6932 new_temp = make_ssa_name (vec_dest, call);
6933 gimple_call_set_lhs (call, new_temp);
6934 gimple_call_set_nothrow (call, true);
6935 new_stmt_info
6936 = vect_finish_stmt_generation (stmt_info, call, gsi);
6938 else
6940 if (op_type == ternary_op)
6941 vop[2] = vec_oprnds2[i];
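/* Rough sketch of the alternative masking strategy used just below:
instead of a conditional internal function, one of the operands is
rewritten with a VEC_COND_EXPR on the loop mask (e.g. selecting zero
for inactive lanes of a DOT_PROD_EXPR), so masked-off lanes contribute
nothing to the accumulator.  */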
6943 if (masked_loop_p && mask_by_cond_expr)
6945 tree mask = vect_get_loop_mask (gsi, masks,
6946 vec_num * ncopies,
6947 vectype_in, i * ncopies + j);
6948 build_vect_cond_expr (code, vop, mask, gsi);
6951 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6952 vop[0], vop[1], vop[2]);
6953 new_temp = make_ssa_name (vec_dest, new_stmt);
6954 gimple_assign_set_lhs (new_stmt, new_temp);
6955 new_stmt_info
6956 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6959 if (slp_node)
6960 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6963 if (slp_node || single_defuse_cycle)
6964 continue;
6966 if (j == 0)
6967 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6968 else
6969 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6971 prev_stmt_info = new_stmt_info;
6974 if (single_defuse_cycle && !slp_node)
6975 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6977 return true;
6980 /* Transform phase of a cycle PHI. */
6982 bool
6983 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6984 slp_tree slp_node, slp_instance slp_node_instance)
6986 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6987 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6988 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6989 int i;
6990 int ncopies;
6991 stmt_vec_info prev_phi_info;
6992 int j;
6993 bool nested_cycle = false;
6994 int vec_num;
6996 if (nested_in_vect_loop_p (loop, stmt_info))
6998 loop = loop->inner;
6999 nested_cycle = true;
7002 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7003 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7004 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7005 gcc_assert (reduc_info->is_reduc_info);
7007 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7008 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7009 /* Leave the scalar phi in place. */
7010 return true;
7012 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7013 /* For a nested cycle we do not fill the above. */
7014 if (!vectype_in)
7015 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7016 gcc_assert (vectype_in);
7018 if (slp_node)
7020 /* The size vect_schedule_slp_instance computes is off for us. */
7021 vec_num = vect_get_num_vectors
7022 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7023 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7024 ncopies = 1;
7026 else
7028 vec_num = 1;
7029 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7032 /* Check whether we should use a single PHI node and accumulate
7033 vectors to one before the backedge. */
7034 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7035 ncopies = 1;
7037 /* Create the destination vector */
7038 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7039 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7040 vectype_out);
7042 /* Get the loop-entry arguments. */
7043 tree vec_initial_def;
7044 auto_vec<tree> vec_initial_defs;
7045 if (slp_node)
7047 vec_initial_defs.reserve (vec_num);
7048 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7049 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7050 tree neutral_op
7051 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7052 STMT_VINFO_REDUC_CODE (reduc_info),
7053 first != NULL);
7054 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7055 &vec_initial_defs, vec_num,
7056 first != NULL, neutral_op);
7058 else
7060 /* Get at the scalar def before the loop; it defines the initial
7061 value of the reduction variable. */
7062 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7063 loop_preheader_edge (loop));
7064 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7065 and we can't use zero for induc_val, use initial_def. Similarly
7066 for REDUC_MIN and initial_def larger than the base. */
7067 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7069 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7070 if (TREE_CODE (initial_def) == INTEGER_CST
7071 && !integer_zerop (induc_val)
7072 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7073 && tree_int_cst_lt (initial_def, induc_val))
7074 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7075 && tree_int_cst_lt (induc_val, initial_def))))
7077 induc_val = initial_def;
7078 /* Communicate we used the initial_def to epilogue
7079 generation. */
7080 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7082 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7084 else if (nested_cycle)
7086 /* Do not use an adjustment def as that case is not supported
7087 correctly if ncopies is not one. */
7088 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7089 reduc_stmt_info);
7091 else
7093 tree adjustment_def = NULL_TREE;
7094 tree *adjustment_defp = &adjustment_def;
7095 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7096 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7097 adjustment_defp = NULL;
7098 vec_initial_def
7099 = get_initial_def_for_reduction (reduc_stmt_info, code,
7100 initial_def, adjustment_defp);
7101 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
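/* Illustrative example: for a sum reduction with scalar initial value X,
get_initial_def_for_reduction may start the vector accumulator at
{ 0, 0, ... } and record X as the adjustment, which the epilogue code
then adds back to the reduced result.  */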
7103 vec_initial_defs.create (1);
7104 vec_initial_defs.quick_push (vec_initial_def);
7107 /* Generate the reduction PHIs upfront. */
7108 prev_phi_info = NULL;
7109 for (i = 0; i < vec_num; i++)
7111 tree vec_init_def = vec_initial_defs[i];
7112 for (j = 0; j < ncopies; j++)
7114 /* Create the reduction-phi that defines the reduction
7115 operand. */
7116 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7117 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7119 /* Set the loop-entry arg of the reduction-phi. */
7120 if (j != 0 && nested_cycle)
7121 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7122 vec_init_def);
7123 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7124 UNKNOWN_LOCATION);
7126 /* The loop-latch arg is set in epilogue processing. */
7128 if (slp_node)
7129 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7130 else
7132 if (j == 0)
7133 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7134 else
7135 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7136 prev_phi_info = new_phi_info;
7141 return true;
7144 /* Vectorizes LC PHIs. */
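/* An LC (loop-closed SSA) PHI is a single-argument PHI sitting on a loop
exit edge, carrying a value defined inside the loop to its uses outside
the loop.  */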
7146 bool
7147 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7148 slp_tree slp_node)
7150 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7151 if (!loop_vinfo
7152 || !is_a <gphi *> (stmt_info->stmt)
7153 || gimple_phi_num_args (stmt_info->stmt) != 1)
7154 return false;
7156 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7157 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7158 return false;
7160 if (!vec_stmt) /* transformation not required. */
7162 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7163 return true;
7166 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7167 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7168 basic_block bb = gimple_bb (stmt_info->stmt);
7169 edge e = single_pred_edge (bb);
7170 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7171 vec<tree> vec_oprnds = vNULL;
7172 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7173 stmt_info, &vec_oprnds, NULL, slp_node);
7174 if (slp_node)
7176 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7177 gcc_assert (vec_oprnds.length () == vec_num);
7178 for (unsigned i = 0; i < vec_num; i++)
7180 /* Create the vectorized LC PHI node. */
7181 gphi *new_phi = create_phi_node (vec_dest, bb);
7182 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7183 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7184 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7187 else
7189 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7190 stmt_vec_info prev_phi_info = NULL;
7191 for (unsigned i = 0; i < ncopies; i++)
7193 if (i != 0)
7194 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7195 /* Create the vectorized LC PHI node. */
7196 gphi *new_phi = create_phi_node (vec_dest, bb);
7197 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7198 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7199 if (i == 0)
7200 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7201 else
7202 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7203 prev_phi_info = new_phi_info;
7206 vec_oprnds.release ();
7208 return true;
7212 /* Function vect_min_worthwhile_factor.
7214 For a loop where we could vectorize the operation indicated by CODE,
7215 return the minimum vectorization factor that makes it worthwhile
7216 to use generic vectors. */
7217 static unsigned int
7218 vect_min_worthwhile_factor (enum tree_code code)
7220 switch (code)
7222 case PLUS_EXPR:
7223 case MINUS_EXPR:
7224 case NEGATE_EXPR:
7225 return 4;
7227 case BIT_AND_EXPR:
7228 case BIT_IOR_EXPR:
7229 case BIT_XOR_EXPR:
7230 case BIT_NOT_EXPR:
7231 return 2;
7233 default:
7234 return INT_MAX;
7238 /* Return true if VINFO indicates we are doing loop vectorization and if
7239 it is worth decomposing CODE operations into scalar operations for
7240 that loop's vectorization factor. */
7242 bool
7243 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7245 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7246 unsigned HOST_WIDE_INT value;
7247 return (loop_vinfo
7248 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7249 && value >= vect_min_worthwhile_factor (code));
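/* For example (illustrative numbers): with a constant vectorization factor
of 4, a PLUS_EXPR operation passes the check above (4 >= 4), whereas with
a factor of 2 only the bitwise codes do (2 >= 2).  This is a coarse
heuristic for emulated (non-SIMD) vectors, not a full cost model.  */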
7252 /* Function vectorizable_induction
7254 Check if STMT_INFO performs an induction computation that can be vectorized.
7255 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7256 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7257 Return true if STMT_INFO is vectorizable in this way. */
7259 bool
7260 vectorizable_induction (stmt_vec_info stmt_info,
7261 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7262 stmt_vec_info *vec_stmt, slp_tree slp_node,
7263 stmt_vector_for_cost *cost_vec)
7265 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7266 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7267 unsigned ncopies;
7268 bool nested_in_vect_loop = false;
7269 class loop *iv_loop;
7270 tree vec_def;
7271 edge pe = loop_preheader_edge (loop);
7272 basic_block new_bb;
7273 tree new_vec, vec_init, vec_step, t;
7274 tree new_name;
7275 gimple *new_stmt;
7276 gphi *induction_phi;
7277 tree induc_def, vec_dest;
7278 tree init_expr, step_expr;
7279 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7280 unsigned i;
7281 tree expr;
7282 gimple_seq stmts;
7283 imm_use_iterator imm_iter;
7284 use_operand_p use_p;
7285 gimple *exit_phi;
7286 edge latch_e;
7287 tree loop_arg;
7288 gimple_stmt_iterator si;
7290 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7291 if (!phi)
7292 return false;
7294 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7295 return false;
7297 /* Make sure it was recognized as induction computation. */
7298 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7299 return false;
7301 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7302 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7304 if (slp_node)
7305 ncopies = 1;
7306 else
7307 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7308 gcc_assert (ncopies >= 1);
7310 /* FORNOW. These restrictions should be relaxed. */
7311 if (nested_in_vect_loop_p (loop, stmt_info))
7313 imm_use_iterator imm_iter;
7314 use_operand_p use_p;
7315 gimple *exit_phi;
7316 edge latch_e;
7317 tree loop_arg;
7319 if (ncopies > 1)
7321 if (dump_enabled_p ())
7322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7323 "multiple types in nested loop.\n");
7324 return false;
7327 /* FORNOW: outer loop induction with SLP not supported. */
7328 if (STMT_SLP_TYPE (stmt_info))
7329 return false;
7331 exit_phi = NULL;
7332 latch_e = loop_latch_edge (loop->inner);
7333 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7336 gimple *use_stmt = USE_STMT (use_p);
7337 if (is_gimple_debug (use_stmt))
7338 continue;
7340 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7342 exit_phi = use_stmt;
7343 break;
7346 if (exit_phi)
7348 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7349 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7350 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7352 if (dump_enabled_p ())
7353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7354 "inner-loop induction only used outside "
7355 "of the outer vectorized loop.\n");
7356 return false;
7360 nested_in_vect_loop = true;
7361 iv_loop = loop->inner;
7363 else
7364 iv_loop = loop;
7365 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7367 if (slp_node && !nunits.is_constant ())
7369 /* The current SLP code creates the initial value element-by-element. */
7370 if (dump_enabled_p ())
7371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7372 "SLP induction not supported for variable-length"
7373 " vectors.\n");
7374 return false;
7377 if (!vec_stmt) /* transformation not required. */
7379 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7380 DUMP_VECT_SCOPE ("vectorizable_induction");
7381 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7382 return true;
7385 /* Transform. */
7387 /* Compute a vector variable, initialized with the first VF values of
7388 the induction variable. E.g., for an iv with IV_PHI='X' and
7389 evolution S, for a vector of 4 units, we want to compute:
7390 [X, X + S, X + 2*S, X + 3*S]. */
7392 if (dump_enabled_p ())
7393 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7395 latch_e = loop_latch_edge (iv_loop);
7396 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7398 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7399 gcc_assert (step_expr != NULL_TREE);
7400 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7402 pe = loop_preheader_edge (iv_loop);
7403 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7404 loop_preheader_edge (iv_loop));
7406 stmts = NULL;
7407 if (!nested_in_vect_loop)
7409 /* Convert the initial value to the IV update type. */
7410 tree new_type = TREE_TYPE (step_expr);
7411 init_expr = gimple_convert (&stmts, new_type, init_expr);
7413 /* If we are using the loop mask to "peel" for alignment then we need
7414 to adjust the start value here. */
7415 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7416 if (skip_niters != NULL_TREE)
7418 if (FLOAT_TYPE_P (vectype))
7419 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7420 skip_niters);
7421 else
7422 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7423 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7424 skip_niters, step_expr);
7425 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7426 init_expr, skip_step);
7430 if (stmts)
7432 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7433 gcc_assert (!new_bb);
7436 /* Find the first insertion point in the BB. */
7437 basic_block bb = gimple_bb (phi);
7438 si = gsi_after_labels (bb);
7440 /* For SLP induction we have to generate several IVs as for example
7441 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7442 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7443 [VF*S, VF*S, VF*S, VF*S] for all. */
7444 if (slp_node)
7446 /* Enforced above. */
7447 unsigned int const_nunits = nunits.to_constant ();
7449 /* Generate [VF*S, VF*S, ... ]. */
7450 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7452 expr = build_int_cst (integer_type_node, vf);
7453 expr = fold_convert (TREE_TYPE (step_expr), expr);
7455 else
7456 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7457 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7458 expr, step_expr);
7459 if (! CONSTANT_CLASS_P (new_name))
7460 new_name = vect_init_vector (stmt_info, new_name,
7461 TREE_TYPE (step_expr), NULL);
7462 new_vec = build_vector_from_val (step_vectype, new_name);
7463 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7465 /* Now generate the IVs. */
7466 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7467 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7468 unsigned elts = const_nunits * nvects;
7469 unsigned nivs = least_common_multiple (group_size,
7470 const_nunits) / const_nunits;
7471 gcc_assert (elts % group_size == 0);
7472 tree elt = init_expr;
7473 unsigned ivn;
7474 for (ivn = 0; ivn < nivs; ++ivn)
7476 tree_vector_builder elts (step_vectype, const_nunits, 1);
7477 stmts = NULL;
7478 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7480 if (ivn*const_nunits + eltn >= group_size
7481 && (ivn * const_nunits + eltn) % group_size == 0)
7482 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7483 elt, step_expr);
7484 elts.quick_push (elt);
7486 vec_init = gimple_build_vector (&stmts, &elts);
7487 vec_init = gimple_convert (&stmts, vectype, vec_init);
7488 if (stmts)
7490 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7491 gcc_assert (!new_bb);
7494 /* Create the induction-phi that defines the induction-operand. */
7495 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7496 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7497 stmt_vec_info induction_phi_info
7498 = loop_vinfo->add_stmt (induction_phi);
7499 induc_def = PHI_RESULT (induction_phi);
7501 /* Create the iv update inside the loop */
7502 gimple_seq stmts = NULL;
7503 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7504 vec_def = gimple_build (&stmts,
7505 PLUS_EXPR, step_vectype, vec_def, vec_step);
7506 vec_def = gimple_convert (&stmts, vectype, vec_def);
7507 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7508 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7510 /* Set the arguments of the phi node: */
7511 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7512 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7513 UNKNOWN_LOCATION);
7515 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7518 /* Re-use IVs when we can. */
7519 if (ivn < nvects)
7521 unsigned vfp
7522 = least_common_multiple (group_size, const_nunits) / group_size;
7523 /* Generate [VF'*S, VF'*S, ... ]. */
7524 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7526 expr = build_int_cst (integer_type_node, vfp);
7527 expr = fold_convert (TREE_TYPE (step_expr), expr);
7529 else
7530 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7531 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7532 expr, step_expr);
7533 if (! CONSTANT_CLASS_P (new_name))
7534 new_name = vect_init_vector (stmt_info, new_name,
7535 TREE_TYPE (step_expr), NULL);
7536 new_vec = build_vector_from_val (step_vectype, new_name);
7537 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7538 for (; ivn < nvects; ++ivn)
7540 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7541 tree def;
7542 if (gimple_code (iv) == GIMPLE_PHI)
7543 def = gimple_phi_result (iv);
7544 else
7545 def = gimple_assign_lhs (iv);
7546 gimple_seq stmts = NULL;
7547 def = gimple_convert (&stmts, step_vectype, def);
7548 def = gimple_build (&stmts,
7549 PLUS_EXPR, step_vectype, def, vec_step);
7550 def = gimple_convert (&stmts, vectype, def);
7551 if (gimple_code (iv) == GIMPLE_PHI)
7552 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7553 else
7555 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7556 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7558 SLP_TREE_VEC_STMTS (slp_node).quick_push
7559 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7563 return true;
7566 /* Create the vector that holds the initial_value of the induction. */
7567 if (nested_in_vect_loop)
7569 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7570 been created during vectorization of previous stmts. We obtain it
7571 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7572 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7573 /* If the initial value is not of proper type, convert it. */
7574 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7576 new_stmt
7577 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7578 vect_simple_var,
7579 "vec_iv_"),
7580 VIEW_CONVERT_EXPR,
7581 build1 (VIEW_CONVERT_EXPR, vectype,
7582 vec_init));
7583 vec_init = gimple_assign_lhs (new_stmt);
7584 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7585 new_stmt);
7586 gcc_assert (!new_bb);
7587 loop_vinfo->add_stmt (new_stmt);
7590 else
7592 /* iv_loop is the loop to be vectorized. Create:
7593 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7594 stmts = NULL;
7595 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7597 unsigned HOST_WIDE_INT const_nunits;
7598 if (nunits.is_constant (&const_nunits))
7600 tree_vector_builder elts (step_vectype, const_nunits, 1);
7601 elts.quick_push (new_name);
7602 for (i = 1; i < const_nunits; i++)
7604 /* Create: new_name_i = new_name + step_expr */
7605 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7606 new_name, step_expr);
7607 elts.quick_push (new_name);
7609 /* Create a vector from [new_name_0, new_name_1, ...,
7610 new_name_nunits-1] */
7611 vec_init = gimple_build_vector (&stmts, &elts);
7613 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7614 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7615 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7616 new_name, step_expr);
7617 else
7619 /* Build:
7620 [base, base, base, ...]
7621 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7622 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7623 gcc_assert (flag_associative_math);
7624 tree index = build_index_vector (step_vectype, 0, 1);
7625 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7626 new_name);
7627 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7628 step_expr);
7629 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7630 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7631 vec_init, step_vec);
7632 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7633 vec_init, base_vec);
7635 vec_init = gimple_convert (&stmts, vectype, vec_init);
7637 if (stmts)
7639 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7640 gcc_assert (!new_bb);
7645 /* Create the vector that holds the step of the induction. */
7646 if (nested_in_vect_loop)
7647 /* iv_loop is nested in the loop to be vectorized. Generate:
7648 vec_step = [S, S, S, S] */
7649 new_name = step_expr;
7650 else
7652 /* iv_loop is the loop to be vectorized. Generate:
7653 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7654 gimple_seq seq = NULL;
7655 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7657 expr = build_int_cst (integer_type_node, vf);
7658 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7660 else
7661 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7662 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7663 expr, step_expr);
7664 if (seq)
7666 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7667 gcc_assert (!new_bb);
7671 t = unshare_expr (new_name);
7672 gcc_assert (CONSTANT_CLASS_P (new_name)
7673 || TREE_CODE (new_name) == SSA_NAME);
7674 new_vec = build_vector_from_val (step_vectype, t);
7675 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7678 /* Create the following def-use cycle:
7679 loop prolog:
7680 vec_init = ...
7681 vec_step = ...
7682 loop:
7683 vec_iv = PHI <vec_init, vec_loop>
7685 STMT
7687 vec_loop = vec_iv + vec_step; */
7689 /* Create the induction-phi that defines the induction-operand. */
7690 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7691 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7692 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7693 induc_def = PHI_RESULT (induction_phi);
7695 /* Create the iv update inside the loop */
7696 stmts = NULL;
7697 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7698 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7699 vec_def = gimple_convert (&stmts, vectype, vec_def);
7700 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7701 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7702 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7704 /* Set the arguments of the phi node: */
7705 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7706 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7707 UNKNOWN_LOCATION);
7709 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7711 /* In case that vectorization factor (VF) is bigger than the number
7712 of elements that we can fit in a vectype (nunits), we have to generate
7713 more than one vector stmt - i.e - we need to "unroll" the
7714 vector stmt by a factor VF/nunits. For more details see documentation
7715 in vectorizable_operation. */
7717 if (ncopies > 1)
7719 gimple_seq seq = NULL;
7720 stmt_vec_info prev_stmt_vinfo;
7721 /* FORNOW. This restriction should be relaxed. */
7722 gcc_assert (!nested_in_vect_loop);
7724 /* Create the vector that holds the step of the induction. */
7725 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7727 expr = build_int_cst (integer_type_node, nunits);
7728 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7730 else
7731 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7732 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7733 expr, step_expr);
7734 if (seq)
7736 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7737 gcc_assert (!new_bb);
7740 t = unshare_expr (new_name);
7741 gcc_assert (CONSTANT_CLASS_P (new_name)
7742 || TREE_CODE (new_name) == SSA_NAME);
7743 new_vec = build_vector_from_val (step_vectype, t);
7744 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7746 vec_def = induc_def;
7747 prev_stmt_vinfo = induction_phi_info;
7748 for (i = 1; i < ncopies; i++)
7750 /* vec_i = vec_prev + vec_step */
7751 gimple_seq stmts = NULL;
7752 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7753 vec_def = gimple_build (&stmts,
7754 PLUS_EXPR, step_vectype, vec_def, vec_step);
7755 vec_def = gimple_convert (&stmts, vectype, vec_def);
7757 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7758 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7759 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7760 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7761 prev_stmt_vinfo = new_stmt_info;
7765 if (nested_in_vect_loop)
7767 /* Find the loop-closed exit-phi of the induction, and record
7768 the final vector of induction results: */
7769 exit_phi = NULL;
7770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7772 gimple *use_stmt = USE_STMT (use_p);
7773 if (is_gimple_debug (use_stmt))
7774 continue;
7776 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7778 exit_phi = use_stmt;
7779 break;
7782 if (exit_phi)
7784 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7785 /* FORNOW. Currently not supporting the case that an inner-loop induction
7786 is not used in the outer-loop (i.e. only outside the outer-loop). */
7787 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7788 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7790 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7791 if (dump_enabled_p ())
7792 dump_printf_loc (MSG_NOTE, vect_location,
7793 "vector of inductions after inner-loop:%G",
7794 new_stmt);
7799 if (dump_enabled_p ())
7800 dump_printf_loc (MSG_NOTE, vect_location,
7801 "transform induction: created def-use cycle: %G%G",
7802 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7804 return true;
7807 /* Function vectorizable_live_operation.
7809 STMT_INFO computes a value that is used outside the loop. Check if
7810 it can be supported. */
7812 bool
7813 vectorizable_live_operation (stmt_vec_info stmt_info,
7814 gimple_stmt_iterator *gsi,
7815 slp_tree slp_node, slp_instance slp_node_instance,
7816 int slp_index, bool vec_stmt_p,
7817 stmt_vector_for_cost *)
7819 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7820 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7821 imm_use_iterator imm_iter;
7822 tree lhs, lhs_type, bitsize, vec_bitsize;
7823 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7824 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7825 int ncopies;
7826 gimple *use_stmt;
7827 auto_vec<tree> vec_oprnds;
7828 int vec_entry = 0;
7829 poly_uint64 vec_index = 0;
7831 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7833 /* If a stmt of a reduction is live, vectorize it via
7834 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7835 validity so just trigger the transform here. */
7836 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7838 if (!vec_stmt_p)
7839 return true;
7840 if (slp_node)
7842 /* For reduction chains the meta-info is attached to
7843 the group leader. */
7844 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7845 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7846 /* For SLP reductions we vectorize the epilogue for
7847 all involved stmts together. */
7848 else if (slp_index != 0)
7849 return true;
7851 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7852 gcc_assert (reduc_info->is_reduc_info);
7853 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7854 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7855 return true;
7856 vect_create_epilog_for_reduction (stmt_info, slp_node,
7857 slp_node_instance);
7858 return true;
7861 /* FORNOW. CHECKME. */
7862 if (nested_in_vect_loop_p (loop, stmt_info))
7863 return false;
7865 /* If STMT is not relevant and it is a simple assignment and its inputs are
7866 invariant then it can remain in place, unvectorized. The original last
7867 scalar value that it computes will be used. */
7868 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7870 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7871 if (dump_enabled_p ())
7872 dump_printf_loc (MSG_NOTE, vect_location,
7873 "statement is simple and uses invariant. Leaving in "
7874 "place.\n");
7875 return true;
7878 if (slp_node)
7879 ncopies = 1;
7880 else
7881 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7883 if (slp_node)
7885 gcc_assert (slp_index >= 0);
7887 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7888 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7890 /* Get the last occurrence of the scalar index from the concatenation of
7891 all the slp vectors. Calculate which slp vector it is and the index
7892 within. */
7893 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7895 /* Calculate which vector contains the result, and which lane of
7896 that vector we need. */
7897 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7899 if (dump_enabled_p ())
7900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7901 "Cannot determine which vector holds the"
7902 " final result.\n");
7903 return false;
7907 if (!vec_stmt_p)
7909 /* No transformation required. */
7910 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7912 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7913 OPTIMIZE_FOR_SPEED))
7915 if (dump_enabled_p ())
7916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7917 "can't use a fully-masked loop because "
7918 "the target doesn't support extract last "
7919 "reduction.\n");
7920 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7922 else if (slp_node)
7924 if (dump_enabled_p ())
7925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7926 "can't use a fully-masked loop because an "
7927 "SLP statement is live after the loop.\n");
7928 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7930 else if (ncopies > 1)
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 "can't use a fully-masked loop because"
7935 " ncopies is greater than 1.\n");
7936 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7938 else
7940 gcc_assert (ncopies == 1 && !slp_node);
7941 vect_record_loop_mask (loop_vinfo,
7942 &LOOP_VINFO_MASKS (loop_vinfo),
7943 1, vectype, NULL);
7946 return true;
7949 /* Use the lhs of the original scalar statement. */
7950 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7952 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7953 : gimple_get_lhs (stmt);
7954 lhs_type = TREE_TYPE (lhs);
7956 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7957 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7958 : TYPE_SIZE (TREE_TYPE (vectype)));
7959 vec_bitsize = TYPE_SIZE (vectype);
7961 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7962 tree vec_lhs, bitstart;
7963 if (slp_node)
7965 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7967 /* Get the correct slp vectorized stmt. */
7968 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7969 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7970 vec_lhs = gimple_phi_result (phi);
7971 else
7972 vec_lhs = gimple_get_lhs (vec_stmt);
7974 /* Get entry to use. */
7975 bitstart = bitsize_int (vec_index);
7976 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7978 else
7980 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7981 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7982 gcc_checking_assert (ncopies == 1
7983 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7985 /* For multiple copies, get the last copy. */
7986 for (int i = 1; i < ncopies; ++i)
7987 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7989 /* Get the last lane in the vector. */
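/* E.g. for a V4SI vector: vec_bitsize is 128 and bitsize is 32, so
bitstart becomes 96, selecting the last lane below (illustrative sizes).  */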
7990 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7993 gimple_seq stmts = NULL;
7994 tree new_tree;
7995 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7997 /* Emit:
7999 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8001 where VEC_LHS is the vectorized live-out result and MASK is
8002 the loop mask for the final iteration. */
8003 gcc_assert (ncopies == 1 && !slp_node);
8004 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8005 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8006 1, vectype, 0);
8007 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8008 scalar_type, mask, vec_lhs);
8010 /* Convert the extracted vector element to the required scalar type. */
8011 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8013 else
8015 tree bftype = TREE_TYPE (vectype);
8016 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8017 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8018 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8019 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8020 &stmts, true, NULL_TREE);
8023 if (stmts)
8024 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8026 /* Replace use of lhs with newly computed result. If the use stmt is a
8027 single arg PHI, just replace all uses of the PHI result. This is necessary
8028 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8029 use_operand_p use_p;
8030 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8031 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8032 && !is_gimple_debug (use_stmt))
8034 if (gimple_code (use_stmt) == GIMPLE_PHI
8035 && gimple_phi_num_args (use_stmt) == 1)
8037 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8039 else
8041 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8042 SET_USE (use_p, new_tree);
8044 update_stmt (use_stmt);
8047 return true;
8050 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8052 static void
8053 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8055 ssa_op_iter op_iter;
8056 imm_use_iterator imm_iter;
8057 def_operand_p def_p;
8058 gimple *ustmt;
8060 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8062 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8064 basic_block bb;
8066 if (!is_gimple_debug (ustmt))
8067 continue;
8069 bb = gimple_bb (ustmt);
8071 if (!flow_bb_inside_loop_p (loop, bb))
8073 if (gimple_debug_bind_p (ustmt))
8075 if (dump_enabled_p ())
8076 dump_printf_loc (MSG_NOTE, vect_location,
8077 "killing debug use\n");
8079 gimple_debug_bind_reset_value (ustmt);
8080 update_stmt (ustmt);
8082 else
8083 gcc_unreachable ();
8089 /* Given loop represented by LOOP_VINFO, return true if computation of
8090 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8091 otherwise. */
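/* For example, if the IV type is a 32-bit unsigned type and NITERSM1 is
0xffffffff, NITERS wraps to zero; both the constant check and the
max-iterations check below then fail, so the function returns false
(illustrative values).  */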
8093 static bool
8094 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8096 /* Constant case. */
8097 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8099 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8100 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8102 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8103 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8104 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8105 return true;
8108 widest_int max;
8109 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8110 /* Check the upper bound of loop niters. */
8111 if (get_max_loop_iterations (loop, &max))
8113 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8114 signop sgn = TYPE_SIGN (type);
8115 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8116 if (max < type_max)
8117 return true;
8119 return false;
8122 /* Return a mask type with half the number of elements as OLD_TYPE,
8123 given that it should have mode NEW_MODE. */
8125 tree
8126 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8128 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8129 return build_truth_vector_type_for_mode (nunits, new_mode);
8132 /* Return a mask type with twice as many elements as OLD_TYPE,
8133 given that it should have mode NEW_MODE. */
8135 tree
8136 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8138 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8139 return build_truth_vector_type_for_mode (nunits, new_mode);
8142 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8143 contain a sequence of NVECTORS masks that each control a vector of type
8144 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8145 these vector masks with the vector version of SCALAR_MASK. */
8147 void
8148 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8149 unsigned int nvectors, tree vectype, tree scalar_mask)
8151 gcc_assert (nvectors != 0);
8152 if (masks->length () < nvectors)
8153 masks->safe_grow_cleared (nvectors);
8154 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8155 /* The number of scalars per iteration and the number of vectors are
8156 both compile-time constants. */
8157 unsigned int nscalars_per_iter
8158 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8159 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
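/* Illustrative numbers: with a vectorization factor of 16, an rgroup of
2 masks of 8 elements each gives nscalars_per_iter = 1, while 2 masks of
16 elements each gives 2.  */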
8161 if (scalar_mask)
8163 scalar_cond_masked_key cond (scalar_mask, nvectors);
8164 loop_vinfo->scalar_cond_masked_set.add (cond);
8167 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8169 rgm->max_nscalars_per_iter = nscalars_per_iter;
8170 rgm->mask_type = truth_type_for (vectype);
8174 /* Given a complete set of masks MASKS, extract mask number INDEX
8175 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8176 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8178 See the comment above vec_loop_masks for more details about the mask
8179 arrangement. */
8181 tree
8182 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8183 unsigned int nvectors, tree vectype, unsigned int index)
8185 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8186 tree mask_type = rgm->mask_type;
8188 /* Populate the rgroup's mask array, if this is the first time we've
8189 used it. */
8190 if (rgm->masks.is_empty ())
8192 rgm->masks.safe_grow_cleared (nvectors);
8193 for (unsigned int i = 0; i < nvectors; ++i)
8195 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8196 /* Provide a dummy definition until the real one is available. */
8197 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8198 rgm->masks[i] = mask;
8202 tree mask = rgm->masks[index];
8203 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8204 TYPE_VECTOR_SUBPARTS (vectype)))
8206 /* A loop mask for data type X can be reused for data type Y
8207 if X has N times more elements than Y and if Y's elements
8208 are N times bigger than X's. In this case each sequence
8209 of N elements in the loop mask will be all-zero or all-one.
8210 We can then view-convert the mask so that each sequence of
8211 N elements is replaced by a single element. */
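/* Concretely (illustrative shapes): a mask created for 16 QImode lanes can
be reused for 8 HImode lanes; each pair of mask elements is identical, so
the VIEW_CONVERT below collapses every pair into a single element.  */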
8212 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8213 TYPE_VECTOR_SUBPARTS (vectype)));
8214 gimple_seq seq = NULL;
8215 mask_type = truth_type_for (vectype);
8216 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8217 if (seq)
8218 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8220 return mask;
8223 /* Scale profiling counters by estimation for LOOP which is vectorized
8224 by factor VF. */
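/* A rough example: if the preheader count is 10 and the estimated number of
iterations of the vectorized loop is 10, the header count is rescaled to
about 10 * (10 + 1) = 110 and the exit probability to 1/(10 + 1)
(illustrative figures).  */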
8226 static void
8227 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8229 edge preheader = loop_preheader_edge (loop);
8230 /* Reduce loop iterations by the vectorization factor. */
8231 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8232 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8234 if (freq_h.nonzero_p ())
8236 profile_probability p;
8238 /* Avoid dropping loop body profile counter to 0 because of zero count
8239 in loop's preheader. */
8240 if (!(freq_e == profile_count::zero ()))
8241 freq_e = freq_e.force_nonzero ();
8242 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8243 scale_loop_frequencies (loop, p);
8246 edge exit_e = single_exit (loop);
8247 exit_e->probability = profile_probability::always ()
8248 .apply_scale (1, new_est_niter + 1);
8250 edge exit_l = single_pred_edge (loop->latch);
8251 profile_probability prob = exit_l->probability;
8252 exit_l->probability = exit_e->probability.invert ();
8253 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8254 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8257 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8258 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8259 stmt_vec_info. */
8261 static void
8262 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8263 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8265 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8266 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8268 if (dump_enabled_p ())
8269 dump_printf_loc (MSG_NOTE, vect_location,
8270 "------>vectorizing statement: %G", stmt_info->stmt);
8272 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8273 vect_loop_kill_debug_uses (loop, stmt_info);
8275 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8276 && !STMT_VINFO_LIVE_P (stmt_info))
8277 return;
8279 if (STMT_VINFO_VECTYPE (stmt_info))
8281 poly_uint64 nunits
8282 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8283 if (!STMT_SLP_TYPE (stmt_info)
8284 && maybe_ne (nunits, vf)
8285 && dump_enabled_p ())
8286 /* For SLP, VF is set according to the unrolling factor, not
8287 to the vector size, hence this message is not valid for SLP. */
8288 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8291 /* Pure SLP statements have already been vectorized. We still need
8292 to apply loop vectorization to hybrid SLP statements. */
8293 if (PURE_SLP_STMT (stmt_info))
8294 return;
8296 if (dump_enabled_p ())
8297 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8299 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8300 *seen_store = stmt_info;
8303 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8304 in the hash_map with its corresponding values. */
8306 static tree
8307 find_in_mapping (tree t, void *context)
8309 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8311 tree *value = mapping->get (t);
8312 return value ? *value : t;
8315 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8316 original loop that has now been vectorized.
8318 The inits of the data_references need to be advanced with the number of
8319 iterations of the main loop. This has been computed in vect_do_peeling and
8320 is stored in parameter ADVANCE. We first restore the data_references
8321 initial offset with the values recorded in ORIG_DRS_INIT.
8323 Since the loop_vec_info of this EPILOGUE was constructed for the original
8324 loop, its stmt_vec_infos all point to the original statements. These need
8325 to be updated to point to their corresponding copies as well as the SSA_NAMES
8326 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8328 The data_reference's connections also need to be updated. Their
8329 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8330 stmt_vec_infos, their statements need to point to their corresponding copy,
8331 if they are gather loads or scatter stores then their reference needs to be
8332 updated to point to its corresponding copy and finally we set
8333 'base_misaligned' to false as we have already peeled for alignment in the
8334 prologue of the main loop. */
8336 static void
8337 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8339 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8340 auto_vec<gimple *> stmt_worklist;
8341 hash_map<tree,tree> mapping;
8342 gimple *orig_stmt, *new_stmt;
8343 gimple_stmt_iterator epilogue_gsi;
8344 gphi_iterator epilogue_phi_gsi;
8345 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8346 basic_block *epilogue_bbs = get_loop_body (epilogue);
8347 unsigned i;
8349 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8351 /* Advance data_reference's with the number of iterations of the previous
8352 loop and its prologue. */
8353 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8356 /* The EPILOGUE loop is a copy of the original loop so they share the same
8357 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8358 point to the copied statements. We also create a mapping of all LHS' in
8359 the original loop and all the LHS' in the EPILOGUE and create worklists to
8360 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8361 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8363 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8364 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8366 new_stmt = epilogue_phi_gsi.phi ();
8368 gcc_assert (gimple_uid (new_stmt) > 0);
8369 stmt_vinfo
8370 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8372 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8373 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8375 mapping.put (gimple_phi_result (orig_stmt),
8376 gimple_phi_result (new_stmt));
8377 /* PHI nodes cannot have patterns or related statements. */
8378 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8379 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8382 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8383 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8385 new_stmt = gsi_stmt (epilogue_gsi);
8387 gcc_assert (gimple_uid (new_stmt) > 0);
8388 stmt_vinfo
8389 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8391 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8392 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8394 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8395 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8397 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8399 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8400 for (gimple_stmt_iterator gsi = gsi_start (seq);
8401 !gsi_end_p (gsi); gsi_next (&gsi))
8402 stmt_worklist.safe_push (gsi_stmt (gsi));
8405 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8406 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8408 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8409 stmt_worklist.safe_push (stmt);
8410 /* Set BB such that the assert in
8411 'get_initial_def_for_reduction' is able to determine that
8412 the BB of the related stmt is inside this loop. */
8413 gimple_set_bb (stmt,
8414 gimple_bb (new_stmt));
8415 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8416 gcc_assert (related_vinfo == NULL
8417 || related_vinfo == stmt_vinfo);
8422 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8423 using the original main loop and thus need to be updated to refer to the
8424 cloned variables used in the epilogue. */
8425 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8427 gimple *stmt = stmt_worklist[i];
8428 tree *new_op;
8430 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8432 tree op = gimple_op (stmt, j);
8433 if ((new_op = mapping.get(op)))
8434 gimple_set_op (stmt, j, *new_op);
8435 else
8437 /* PR92429: The last argument of simplify_replace_tree disables
8438 folding when replacing arguments. This is required as
8439 otherwise you might end up with different statements than the
8440 ones analyzed in vect_loop_analyze, leading to different
8441 vectorization. */
8442 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8443 &find_in_mapping, &mapping, false);
8444 gimple_set_op (stmt, j, op);
8449 struct data_reference *dr;
8450 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8451 FOR_EACH_VEC_ELT (datarefs, i, dr)
8453 orig_stmt = DR_STMT (dr);
8454 gcc_assert (gimple_uid (orig_stmt) > 0);
8455 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8456 /* Data references for gather loads and scatter stores do not use the
8457 updated offset we set using ADVANCE. Instead we have to make sure the
8458 reference in each data reference points to the corresponding copy of
8459 the original in the epilogue. */
8460 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8461 == VMAT_GATHER_SCATTER)
8463 DR_REF (dr)
8464 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8465 &find_in_mapping, &mapping);
8466 DR_BASE_ADDRESS (dr)
8467 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8468 &find_in_mapping, &mapping);
8470 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8471 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8472 /* The vector size of the epilogue is smaller than that of the main loop
8473 so the alignment requirement is either the same or lower. This means
8474 the DR will by definition be aligned. */
8475 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8478 epilogue_vinfo->shared->datarefs_copy.release ();
8479 epilogue_vinfo->shared->save_datarefs ();
8482 /* Function vect_transform_loop.
8484 The analysis phase has determined that the loop is vectorizable.
8485 Vectorize the loop - create vectorized stmts to replace the scalar
8486 stmts in the loop, and update the loop exit condition.
8487 Returns the scalar epilogue loop if any. */
8489 class loop *
8490 vect_transform_loop (loop_vec_info loop_vinfo)
8492 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8493 class loop *epilogue = NULL;
8494 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8495 int nbbs = loop->num_nodes;
8496 int i;
8497 tree niters_vector = NULL_TREE;
8498 tree step_vector = NULL_TREE;
8499 tree niters_vector_mult_vf = NULL_TREE;
8500 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8501 unsigned int lowest_vf = constant_lower_bound (vf);
8502 gimple *stmt;
8503 bool check_profitability = false;
8504 unsigned int th;
8506 DUMP_VECT_SCOPE ("vec_transform_loop");
8508 loop_vinfo->shared->check_datarefs ();
8510 /* Use the more conservative vectorization threshold. If the number
8511 of iterations is constant, assume the cost check has been performed
8512 by our caller. If the threshold makes all loops profitable that
8513 run at least the (estimated) vectorization factor number of times,
8514 checking is pointless, too. */
8515 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8516 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8518 if (dump_enabled_p ())
8519 dump_printf_loc (MSG_NOTE, vect_location,
8520 "Profitability threshold is %d loop iterations.\n",
8521 th);
8522 check_profitability = true;
8525 /* Make sure there exists a single-predecessor exit bb. Do this before
8526 versioning. */
8527 edge e = single_exit (loop);
8528 if (! single_pred_p (e->dest))
8530 split_loop_exit_edge (e, true);
8531 if (dump_enabled_p ())
8532 dump_printf (MSG_NOTE, "split exit edge\n");
8535 /* Version the loop first, if required, so the profitability check
8536 comes first. */
8538 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8540 class loop *sloop
8541 = vect_loop_versioning (loop_vinfo);
8542 sloop->force_vectorize = false;
8543 check_profitability = false;
8546 /* Make sure there exists a single-predecessor exit bb also on the
8547 scalar loop copy. Do this after versioning but before peeling
8548 so the CFG structure is fine for both the scalar and the if-converted
8549 loop, letting slpeel_duplicate_current_defs_from_edges see matched
8550 loop-closed PHI nodes on the exit. */
8551 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8553 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8554 if (! single_pred_p (e->dest))
8556 split_loop_exit_edge (e, true);
8557 if (dump_enabled_p ())
8558 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8562 tree niters = vect_build_loop_niters (loop_vinfo);
8563 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8564 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8565 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8566 tree advance;
8567 drs_init_vec orig_drs_init;
8569 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8570 &step_vector, &niters_vector_mult_vf, th,
8571 check_profitability, niters_no_overflow,
8572 &advance);
8574 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8575 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8576 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8577 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
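/* Worked example (the numbers are illustrative only): with a
compile-time NITERS of 16, a constant VF of 4 and no loop masking, the
branch below simply sets NITERS_VECTOR to the constant 4 with a step of
1; otherwise vect_gen_vector_loop_niters emits the runtime computation. */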
8579 if (niters_vector == NULL_TREE)
8581 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8582 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8583 && known_eq (lowest_vf, vf))
8585 niters_vector
8586 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8587 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8588 step_vector = build_one_cst (TREE_TYPE (niters));
8590 else
8591 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8592 &step_vector, niters_no_overflow);
8595 /* 1) Make sure the loop header has exactly two entries
8596 2) Make sure we have a preheader basic block. */
8598 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8600 split_edge (loop_preheader_edge (loop));
8602 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8603 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8604 /* This will deal with any possible peeling. */
8605 vect_prepare_for_masked_peels (loop_vinfo);
8607 /* Schedule the SLP instances first, then handle loop vectorization
8608 below. */
8609 if (!loop_vinfo->slp_instances.is_empty ())
8611 DUMP_VECT_SCOPE ("scheduling SLP instances");
8612 vect_schedule_slp (loop_vinfo);
8615 /* FORNOW: the vectorizer supports only loops whose body consists
8616 of one basic block (header + empty latch). When the vectorizer
8617 supports more involved loop forms, the order in which the BBs are
8618 traversed needs to be reconsidered. */
8620 for (i = 0; i < nbbs; i++)
8622 basic_block bb = bbs[i];
8623 stmt_vec_info stmt_info;
8625 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8626 gsi_next (&si))
8628 gphi *phi = si.phi ();
8629 if (dump_enabled_p ())
8630 dump_printf_loc (MSG_NOTE, vect_location,
8631 "------>vectorizing phi: %G", phi);
8632 stmt_info = loop_vinfo->lookup_stmt (phi);
8633 if (!stmt_info)
8634 continue;
8636 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8637 vect_loop_kill_debug_uses (loop, stmt_info);
8639 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8640 && !STMT_VINFO_LIVE_P (stmt_info))
8641 continue;
8643 if (STMT_VINFO_VECTYPE (stmt_info)
8644 && (maybe_ne
8645 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8646 && dump_enabled_p ())
8647 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8649 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8650 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8651 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8652 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8653 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8654 && ! PURE_SLP_STMT (stmt_info))
8656 if (dump_enabled_p ())
8657 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8658 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8662 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8663 !gsi_end_p (si);)
8665 stmt = gsi_stmt (si);
8666 /* During vectorization remove existing clobber stmts. */
8667 if (gimple_clobber_p (stmt))
8669 unlink_stmt_vdef (stmt);
8670 gsi_remove (&si, true);
8671 release_defs (stmt);
8673 else
8675 stmt_info = loop_vinfo->lookup_stmt (stmt);
8677 /* vector stmts created in the outer-loop during vectorization of
8678 stmts in an inner-loop may not have a stmt_info, and do not
8679 need to be vectorized. */
8680 stmt_vec_info seen_store = NULL;
8681 if (stmt_info)
8683 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8685 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8686 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8687 !gsi_end_p (subsi); gsi_next (&subsi))
8689 stmt_vec_info pat_stmt_info
8690 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8691 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8692 &si, &seen_store);
8694 stmt_vec_info pat_stmt_info
8695 = STMT_VINFO_RELATED_STMT (stmt_info);
8696 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8697 &seen_store);
8699 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8700 &seen_store);
8702 gsi_next (&si);
8703 if (seen_store)
8705 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8706 /* Interleaving. The vectorization of the
8707 interleaving chain was completed - free all
8708 the stores in the chain. */
8709 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8710 else
8711 /* Free the attached stmt_vec_info and remove the stmt. */
8712 loop_vinfo->remove_stmt (stmt_info);
8717 /* Stub out scalar statements that must not survive vectorization.
8718 Doing this here helps with grouped statements, or statements that
8719 are involved in patterns. */
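/* For instance (a made-up GIMPLE fragment): a leftover scalar
'_7 = .MASK_LOAD (&a[i_3], 32B, mask_5);' whose result is not a vector
is replaced below by '_7 = 0;', since the real work is now done by the
vectorized copy of the statement. */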
8720 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8721 !gsi_end_p (gsi); gsi_next (&gsi))
8723 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8724 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8726 tree lhs = gimple_get_lhs (call);
8727 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8729 tree zero = build_zero_cst (TREE_TYPE (lhs));
8730 gimple *new_stmt = gimple_build_assign (lhs, zero);
8731 gsi_replace (&gsi, new_stmt, true);
8735 } /* BBs in loop */
8737 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8738 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8739 if (integer_onep (step_vector))
8740 niters_no_overflow = true;
8741 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8742 niters_vector_mult_vf, !niters_no_overflow);
8744 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8745 scale_profile_for_vect_loop (loop, assumed_vf);
8747 /* True if the final iteration might not handle a full vector's
8748 worth of scalar iterations. */
8749 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8750 /* The minimum number of iterations performed by the epilogue. This
8751 is 1 when peeling for gaps because we always need a final scalar
8752 iteration. */
8753 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8754 /* +1 to convert latch counts to loop iteration counts,
8755 -min_epilogue_iters to remove iterations that cannot be performed
8756 by the vector code. */
8757 int bias_for_lowest = 1 - min_epilogue_iters;
8758 int bias_for_assumed = bias_for_lowest;
8759 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8760 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8762 /* When the amount of peeling is known at compile time, the first
8763 iteration will have exactly alignment_npeels active elements.
8764 In the worst case it will have at least one. */
8765 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8766 bias_for_lowest += lowest_vf - min_first_active;
8767 bias_for_assumed += assumed_vf - min_first_active;
8769 /* In these calculations the "- 1" converts loop iteration counts
8770 back to latch counts. */
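/* Worked example (numbers chosen for illustration): with VF = 4, no
peeling for gaps and no loop masking, min_epilogue_iters is 0 and
bias_for_lowest is 1; an original latch bound of 99 (100 iterations)
then becomes floor ((99 + 1) / 4) - 1 = 24, i.e. the vector loop
executes at most 25 iterations. */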
8771 if (loop->any_upper_bound)
8772 loop->nb_iterations_upper_bound
8773 = (final_iter_may_be_partial
8774 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8775 lowest_vf) - 1
8776 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8777 lowest_vf) - 1);
8778 if (loop->any_likely_upper_bound)
8779 loop->nb_iterations_likely_upper_bound
8780 = (final_iter_may_be_partial
8781 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8782 + bias_for_lowest, lowest_vf) - 1
8783 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8784 + bias_for_lowest, lowest_vf) - 1);
8785 if (loop->any_estimate)
8786 loop->nb_iterations_estimate
8787 = (final_iter_may_be_partial
8788 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8789 assumed_vf) - 1
8790 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8791 assumed_vf) - 1);
8793 if (dump_enabled_p ())
8795 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8797 dump_printf_loc (MSG_NOTE, vect_location,
8798 "LOOP VECTORIZED\n");
8799 if (loop->inner)
8800 dump_printf_loc (MSG_NOTE, vect_location,
8801 "OUTER LOOP VECTORIZED\n");
8802 dump_printf (MSG_NOTE, "\n");
8804 else
8805 dump_printf_loc (MSG_NOTE, vect_location,
8806 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8807 GET_MODE_NAME (loop_vinfo->vector_mode));
8810 /* Loops vectorized with a variable factor won't benefit from
8811 unrolling/peeling. */
8812 if (!vf.is_constant ())
8814 loop->unroll = 1;
8815 if (dump_enabled_p ())
8816 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8817 " variable-length vectorization factor\n");
8819 /* Free SLP instances here because otherwise stmt reference counting
8820 won't work. */
8821 slp_instance instance;
8822 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8823 vect_free_slp_instance (instance, true);
8824 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8825 /* Clear the safelen field since its value is invalid after vectorization,
8826 as the vectorized loop can have loop-carried dependencies. */
8827 loop->safelen = 0;
8829 if (epilogue)
8831 update_epilogue_loop_vinfo (epilogue, advance);
8833 epilogue->simduid = loop->simduid;
8834 epilogue->force_vectorize = loop->force_vectorize;
8835 epilogue->dont_vectorize = false;
8838 return epilogue;
8841 /* The code below tries to perform a simple optimization - revert
8842 if-conversion for masked stores, i.e. if the mask of a store is zero,
8843 do not perform the store and, if possible, skip the stored value producers too.
8844 For example,
8845 for (i=0; i<n; i++)
8846 if (c[i])
8848 p1[i] += 1;
8849 p2[i] = p3[i] +2;
8851 this transformation will produce the following semi-hammock:
8853 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8855 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8856 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8857 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8858 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8859 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8860 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8864 void
8865 optimize_mask_stores (class loop *loop)
8867 basic_block *bbs = get_loop_body (loop);
8868 unsigned nbbs = loop->num_nodes;
8869 unsigned i;
8870 basic_block bb;
8871 class loop *bb_loop;
8872 gimple_stmt_iterator gsi;
8873 gimple *stmt;
8874 auto_vec<gimple *> worklist;
8875 auto_purge_vect_location sentinel;
8877 vect_location = find_loop_location (loop);
8878 /* Pick up all masked stores in the loop, if any. */
8879 for (i = 0; i < nbbs; i++)
8881 bb = bbs[i];
8882 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8883 gsi_next (&gsi))
8885 stmt = gsi_stmt (gsi);
8886 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8887 worklist.safe_push (stmt);
8891 free (bbs);
8892 if (worklist.is_empty ())
8893 return;
8895 /* Loop has masked stores. */
8896 while (!worklist.is_empty ())
8898 gimple *last, *last_store;
8899 edge e, efalse;
8900 tree mask;
8901 basic_block store_bb, join_bb;
8902 gimple_stmt_iterator gsi_to;
8903 tree vdef, new_vdef;
8904 gphi *phi;
8905 tree vectype;
8906 tree zero;
8908 last = worklist.pop ();
8909 mask = gimple_call_arg (last, 2);
8910 bb = gimple_bb (last);
8911 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8912 to the same loop as if_bb. It could be different from LOOP when a
8913 two-level loop nest is vectorized and the mask_store belongs to the
8914 inner one. */
8915 e = split_block (bb, last);
8916 bb_loop = bb->loop_father;
8917 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8918 join_bb = e->dest;
8919 store_bb = create_empty_bb (bb);
8920 add_bb_to_loop (store_bb, bb_loop);
8921 e->flags = EDGE_TRUE_VALUE;
8922 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8923 /* Treat the path into STORE_BB as unlikely. */
8924 efalse->probability = profile_probability::unlikely ();
8925 store_bb->count = efalse->count ();
8926 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8927 if (dom_info_available_p (CDI_DOMINATORS))
8928 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8929 if (dump_enabled_p ())
8930 dump_printf_loc (MSG_NOTE, vect_location,
8931 "Create new block %d to sink mask stores.",
8932 store_bb->index);
8933 /* Create vector comparison with boolean result. */
8934 vectype = TREE_TYPE (mask);
8935 zero = build_zero_cst (vectype);
8936 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8937 gsi = gsi_last_bb (bb);
8938 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8939 /* Create new PHI node for vdef of the last masked store:
8940 .MEM_2 = VDEF <.MEM_1>
8941 will be converted to
8942 .MEM.3 = VDEF <.MEM_1>
8943 and new PHI node will be created in join bb
8944 .MEM_2 = PHI <.MEM_1, .MEM_3>
8946 vdef = gimple_vdef (last);
8947 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8948 gimple_set_vdef (last, new_vdef);
8949 phi = create_phi_node (vdef, join_bb);
8950 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8952 /* Put all masked stores with the same mask to STORE_BB if possible. */
8953 while (true)
8955 gimple_stmt_iterator gsi_from;
8956 gimple *stmt1 = NULL;
8958 /* Move masked store to STORE_BB. */
8959 last_store = last;
8960 gsi = gsi_for_stmt (last);
8961 gsi_from = gsi;
8962 /* Shift GSI to the previous stmt for further traversal. */
8963 gsi_prev (&gsi);
8964 gsi_to = gsi_start_bb (store_bb);
8965 gsi_move_before (&gsi_from, &gsi_to);
8967 /* Set up GSI_TO at the start of the now non-empty STORE_BB. */
8967 gsi_to = gsi_start_bb (store_bb);
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_NOTE, vect_location,
8970 "Move stmt to created bb\n%G", last);
8971 /* Move all stored value producers if possible. */
8972 while (!gsi_end_p (gsi))
8974 tree lhs;
8975 imm_use_iterator imm_iter;
8976 use_operand_p use_p;
8977 bool res;
8979 /* Skip debug statements. */
8980 if (is_gimple_debug (gsi_stmt (gsi)))
8982 gsi_prev (&gsi);
8983 continue;
8985 stmt1 = gsi_stmt (gsi);
8986 /* Do not consider statements writing to memory or having a
8987 volatile operand. */
8988 if (gimple_vdef (stmt1)
8989 || gimple_has_volatile_ops (stmt1))
8990 break;
8991 gsi_from = gsi;
8992 gsi_prev (&gsi);
8993 lhs = gimple_get_lhs (stmt1);
8994 if (!lhs)
8995 break;
8997 /* LHS of vectorized stmt must be SSA_NAME. */
8998 if (TREE_CODE (lhs) != SSA_NAME)
8999 break;
9001 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9003 /* Remove dead scalar statement. */
9004 if (has_zero_uses (lhs))
9006 gsi_remove (&gsi_from, true);
9007 continue;
9011 /* Check that LHS does not have uses outside of STORE_BB. */
9012 res = true;
9013 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9015 gimple *use_stmt;
9016 use_stmt = USE_STMT (use_p);
9017 if (is_gimple_debug (use_stmt))
9018 continue;
9019 if (gimple_bb (use_stmt) != store_bb)
9021 res = false;
9022 break;
9025 if (!res)
9026 break;
9028 if (gimple_vuse (stmt1)
9029 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9030 break;
9032 /* Can move STMT1 to STORE_BB. */
9033 if (dump_enabled_p ())
9034 dump_printf_loc (MSG_NOTE, vect_location,
9035 "Move stmt to created bb\n%G", stmt1);
9036 gsi_move_before (&gsi_from, &gsi_to);
9037 /* Shift GSI_TO for further insertion. */
9038 gsi_prev (&gsi_to);
9040 /* Put other masked stores with the same mask to STORE_BB. */
9041 if (worklist.is_empty ()
9042 || gimple_call_arg (worklist.last (), 2) != mask
9043 || worklist.last () != stmt1)
9044 break;
9045 last = worklist.pop ();
9047 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9051 /* Decide whether it is possible to use a zero-based induction variable
9052 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9053 return the value that the induction variable must be able to hold
9054 in order to ensure that the loop ends with an all-false mask.
9055 Return -1 otherwise. */
9056 widest_int
9057 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9059 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9060 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9061 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9063 /* Calculate the value that the induction variable must be able
9064 to hit in order to ensure that we end the loop with an all-false mask.
9065 This involves adding the maximum number of inactive trailing scalar
9066 iterations. */
9067 widest_int iv_limit = -1;
9068 if (max_loop_iterations (loop, &iv_limit))
9070 if (niters_skip)
9072 /* Add the maximum number of skipped iterations to the
9073 maximum iteration count. */
9074 if (TREE_CODE (niters_skip) == INTEGER_CST)
9075 iv_limit += wi::to_widest (niters_skip);
9076 else
9077 iv_limit += max_vf - 1;
9079 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9080 /* Make a conservatively-correct assumption. */
9081 iv_limit += max_vf - 1;
9083 /* IV_LIMIT is the maximum number of latch iterations, which is also
9084 the maximum in-range IV value. Round this value down to the previous
9085 vector alignment boundary and then add an extra full iteration. */
9086 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9087 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
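/* Illustration with invented numbers: for VF = 4 (so max_vf = 4) and a
maximum latch count of 10, this yields (10 & -4) + 4 = 8 + 4 = 12,
so the IV must be able to hold the value 12. */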
9089 return iv_limit;