gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57 #include "case-cfn-macros.h"
  58
  59 /* Loop Vectorization Pass.
  60
  61    This pass tries to vectorize loops.
  62
  63    For example, the vectorizer transforms the following simple loop:
  64
  65         short a[N]; short b[N]; short c[N]; int i;
  66
  67         for (i=0; i<N; i++){
  68           a[i] = b[i] + c[i];
  69         }
  70
  71    as if it was manually vectorized by rewriting the source code into:
  72
  73         typedef int __attribute__((mode(V8HI))) v8hi;
  74         short a[N];  short b[N]; short c[N];   int i;
  75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  76         v8hi va, vb, vc;
  77
  78         for (i=0; i<N/8; i++){
  79           vb = pb[i];
  80           vc = pc[i];
  81           va = vb + vc;
  82           pa[i] = va;
  83         }
  84
  85         The main entry to this pass is vectorize_loops(), in which
  86    the vectorizer applies a set of analyses on a given set of loops,
  87    followed by the actual vectorization transformation for the loops that
  88    had successfully passed the analysis phase.
  89         Throughout this pass we make a distinction between two types of
  90    data: scalars (which are represented by SSA_NAMES), and memory references
  91    ("data-refs").  These two types of data require different handling both
  92    during analysis and transformation. The types of data-refs that the
  93    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  94    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  95    accesses are required to have a simple (consecutive) access pattern.
  96
  97    Analysis phase:
  98    ===============
  99         The driver for the analysis phase is vect_analyze_loop().
 100    It applies a set of analyses, some of which rely on the scalar evolution
 101    analyzer (scev) developed by Sebastian Pop.
 102
 103         During the analysis phase the vectorizer records some information
 104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 105    loop, as well as general information about the loop as a whole, which is
 106    recorded in a "loop_vec_info" struct attached to each loop.
 107
 108    Transformation phase:
 109    =====================
 110         The loop transformation phase scans all the stmts in the loop, and
 111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 112    the loop that needs to be vectorized.  It inserts the vector code sequence
 113    just before the scalar stmt S, and records a pointer to the vector code
 114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 115    attached to S).  This pointer will be used for the vectorization of following
 116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 117    otherwise, we rely on dead code elimination for removing it.
 118
 119         For example, say stmt S1 was vectorized into stmt VS1:
 120
 121    VS1: vb = px[i];
 122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 123    S2:  a = b;
 124
 125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 128    resulting sequence would be:
 129
 130    VS1: vb = px[i];
 131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 132    VS2: va = vb;
 133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 134
 135         Operands that are not SSA_NAMEs, are data-refs that appear in
 136    load/store operations (like 'x[i]' in S1), and are handled differently.
 137
 138    Target modeling:
 139    =================
 140         Currently the only target specific information that is used is the
 141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 142    Targets that can support different sizes of vectors, for now will need
 143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 144    flexibility will be added in the future.
 145
 146         Since we only vectorize operations which vector form can be
 147    expressed using existing tree codes, to verify that an operation is
 148    supported, the vectorizer checks the relevant optab at the relevant
 149    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 150    the value found is CODE_FOR_nothing, then there's no target support, and
 151    we can't vectorize the stmt.
 152
 153    For additional information on this project see:
 154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 155 */
 156
 157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 158                                                 unsigned *);
 159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 160                                                bool *, bool *, bool);
 161
 162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 163    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 164    may already be set for general statements (not just data refs).  */
 165
 166 static opt_result
 167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 168                               bool vectype_maybe_set_p,
 169                               poly_uint64 *vf)
 170 {
 171   gimple *stmt = stmt_info->stmt;
 172
 173   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 174        && !STMT_VINFO_LIVE_P (stmt_info))
 175       || gimple_clobber_p (stmt))
 176     {
 177       if (dump_enabled_p ())
 178         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 179       return opt_result::success ();
 180     }
 181
 182   tree stmt_vectype, nunits_vectype;
 183   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 184                                                    &stmt_vectype,
 185                                                    &nunits_vectype);
 186   if (!res)
 187     return res;
 188
 189   if (stmt_vectype)
 190     {
 191       if (STMT_VINFO_VECTYPE (stmt_info))
 192         /* The only case when a vectype had been already set is for stmts
 193            that contain a data ref, or for "pattern-stmts" (stmts generated
 194            by the vectorizer to represent/replace a certain idiom).  */
 195         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 196                      || vectype_maybe_set_p)
 197                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 198       else
 199         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 200     }
 201
 202   if (nunits_vectype)
 203     vect_update_max_nunits (vf, nunits_vectype);
 204
 205   return opt_result::success ();
 206 }
 207
 208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 209    types of STMT_INFO and all attached pattern statements and update
 210    the vectorization factor VF accordingly.  Return true on success
 211    or false if something prevented vectorization.  */
 212
 213 static opt_result
 214 vect_determine_vf_for_stmt (vec_info *vinfo,
 215                             stmt_vec_info stmt_info, poly_uint64 *vf)
 216 {
 217   if (dump_enabled_p ())
 218     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 219                      stmt_info->stmt);
 220   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 221   if (!res)
 222     return res;
 223
 224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 225       && STMT_VINFO_RELATED_STMT (stmt_info))
 226     {
 227       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 228       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 229
 230       /* If a pattern statement has def stmts, analyze them too.  */
 231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 232            !gsi_end_p (si); gsi_next (&si))
 233         {
 234           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 235           if (dump_enabled_p ())
 236             dump_printf_loc (MSG_NOTE, vect_location,
 237                              "==> examining pattern def stmt: %G",
 238                              def_stmt_info->stmt);
 239           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 240           if (!res)
 241             return res;
 242         }
 243
 244       if (dump_enabled_p ())
 245         dump_printf_loc (MSG_NOTE, vect_location,
 246                          "==> examining pattern statement: %G",
 247                          stmt_info->stmt);
 248       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 249       if (!res)
 250         return res;
 251     }
 252
 253   return opt_result::success ();
 254 }
 255
 256 /* Function vect_determine_vectorization_factor
 257
 258    Determine the vectorization factor (VF).  VF is the number of data elements
 259    that are operated upon in parallel in a single iteration of the vectorized
 260    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 261    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 262    elements can fit in a single vector register.
 263
 264    We currently support vectorization of loops in which all types operated upon
 265    are of the same size.  Therefore this function currently sets VF according to
 266    the size of the types operated upon, and fails if there are multiple sizes
 267    in the loop.
 268
 269    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 270    original loop:
 271         for (i=0; i<N; i++){
 272           a[i] = b[i] + c[i];
 273         }
 274
 275    vectorized loop:
 276         for (i=0; i<N; i+=VF){
 277           a[i:VF] = b[i:VF] + c[i:VF];
 278         }
 279 */
 280
 281 static opt_result
 282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 283 {
 284   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 285   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 286   unsigned nbbs = loop->num_nodes;
 287   poly_uint64 vectorization_factor = 1;
 288   tree scalar_type = NULL_TREE;
 289   gphi *phi;
 290   tree vectype;
 291   stmt_vec_info stmt_info;
 292   unsigned i;
 293
 294   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 295
 296   for (i = 0; i < nbbs; i++)
 297     {
 298       basic_block bb = bbs[i];
 299
 300       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 301            gsi_next (&si))
 302         {
 303           phi = si.phi ();
 304           stmt_info = loop_vinfo->lookup_stmt (phi);
 305           if (dump_enabled_p ())
 306             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 307                              (gimple *) phi);
 308
 309           gcc_assert (stmt_info);
 310
 311           if (STMT_VINFO_RELEVANT_P (stmt_info)
 312               || STMT_VINFO_LIVE_P (stmt_info))
 313             {
 314               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 315               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 316
 317               if (dump_enabled_p ())
 318                 dump_printf_loc (MSG_NOTE, vect_location,
 319                                  "get vectype for scalar type:  %T\n",
 320                                  scalar_type);
 321
 322               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 323               if (!vectype)
 324                 return opt_result::failure_at (phi,
 325                                                "not vectorized: unsupported "
 326                                                "data-type %T\n",
 327                                                scalar_type);
 328               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 329
 330               if (dump_enabled_p ())
 331                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 332                                  vectype);
 333
 334               if (dump_enabled_p ())
 335                 {
 336                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 337                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 338                   dump_printf (MSG_NOTE, "\n");
 339                 }
 340
 341               vect_update_max_nunits (&vectorization_factor, vectype);
 342             }
 343         }
 344
 345       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 346            gsi_next (&si))
 347         {
 348           if (is_gimple_debug (gsi_stmt (si)))
 349             continue;
 350           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 351           opt_result res
 352             = vect_determine_vf_for_stmt (loop_vinfo,
 353                                           stmt_info, &vectorization_factor);
 354           if (!res)
 355             return res;
 356         }
 357     }
 358
 359   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 360   if (dump_enabled_p ())
 361     {
 362       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 363       dump_dec (MSG_NOTE, vectorization_factor);
 364       dump_printf (MSG_NOTE, "\n");
 365     }
 366
 367   if (known_le (vectorization_factor, 1U))
 368     return opt_result::failure_at (vect_location,
 369                                    "not vectorized: unsupported data-type\n");
 370   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 371   return opt_result::success ();
 372 }
 373
 374
 375 /* Function vect_is_simple_iv_evolution.
 376
 377    FORNOW: A simple evolution of an induction variables in the loop is
 378    considered a polynomial evolution.  */
 379
 380 static bool
 381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 382                              tree * step)
 383 {
 384   tree init_expr;
 385   tree step_expr;
 386   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 387   basic_block bb;
 388
 389   /* When there is no evolution in this loop, the evolution function
 390      is not "simple".  */
 391   if (evolution_part == NULL_TREE)
 392     return false;
 393
 394   /* When the evolution is a polynomial of degree >= 2
 395      the evolution function is not "simple".  */
 396   if (tree_is_chrec (evolution_part))
 397     return false;
 398
 399   step_expr = evolution_part;
 400   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 401
 402   if (dump_enabled_p ())
 403     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 404                      step_expr, init_expr);
 405
 406   *init = init_expr;
 407   *step = step_expr;
 408
 409   if (TREE_CODE (step_expr) != INTEGER_CST
 410       && (TREE_CODE (step_expr) != SSA_NAME
 411           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 412               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 413           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 414               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 415                   || !flag_associative_math)))
 416       && (TREE_CODE (step_expr) != REAL_CST
 417           || !flag_associative_math))
 418     {
 419       if (dump_enabled_p ())
 420         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 421                          "step unknown.\n");
 422       return false;
 423     }
 424
 425   return true;
 426 }
 427
 428 /* Function vect_is_nonlinear_iv_evolution
 429
 430    Only support nonlinear induction for integer type
 431    1. neg
 432    2. mul by constant
 433    3. lshift/rshift by constant.
 434
 435    For neg induction, return a fake step as integer -1.  */
 436 static bool
 437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 438                                 gphi* loop_phi_node, tree *init, tree *step)
 439 {
 440   tree init_expr, ev_expr, result, op1, op2;
 441   gimple* def;
 442
 443   if (gimple_phi_num_args (loop_phi_node) != 2)
 444     return false;
 445
 446   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 447   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 448
 449   /* Support nonlinear induction only for integer type.  */
 450   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 451     return false;
 452
 453   *init = init_expr;
 454   result = PHI_RESULT (loop_phi_node);
 455
 456   if (TREE_CODE (ev_expr) != SSA_NAME
 457       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 458       || !is_gimple_assign (def))
 459     return false;
 460
 461   enum tree_code t_code = gimple_assign_rhs_code (def);
 462   switch (t_code)
 463     {
 464     case NEGATE_EXPR:
 465       if (gimple_assign_rhs1 (def) != result)
 466         return false;
 467       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 468       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 469       break;
 470
 471     case RSHIFT_EXPR:
 472     case LSHIFT_EXPR:
 473     case MULT_EXPR:
 474       op1 = gimple_assign_rhs1 (def);
 475       op2 = gimple_assign_rhs2 (def);
 476       if (TREE_CODE (op2) != INTEGER_CST
 477           || op1 != result)
 478         return false;
 479       *step = op2;
 480       if (t_code == LSHIFT_EXPR)
 481         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 482       else if (t_code == RSHIFT_EXPR)
 483         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 484       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 485       else
 486         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 487       break;
 488
 489     default:
 490       return false;
 491     }
 492
 493   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 494   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 495
 496   return true;
 497 }
 498
 499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 500    what we are assuming is a double reduction.  For example, given
 501    a structure like this:
 502
 503       outer1:
 504         x_1 = PHI <x_4(outer2), ...>;
 505         ...
 506
 507       inner:
 508         x_2 = PHI <x_1(outer1), ...>;
 509         ...
 510         x_3 = ...;
 511         ...
 512
 513       outer2:
 514         x_4 = PHI <x_3(inner)>;
 515         ...
 516
 517    outer loop analysis would treat x_1 as a double reduction phi and
 518    this function would then return true for x_2.  */
 519
 520 static bool
 521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 522 {
 523   use_operand_p use_p;
 524   ssa_op_iter op_iter;
 525   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 526     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 527       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 528         return true;
 529   return false;
 530 }
 531
 532 /* Returns true if Phi is a first-order recurrence. A first-order
 533    recurrence is a non-reduction recurrence relation in which the value of
 534    the recurrence in the current loop iteration equals a value defined in
 535    the previous iteration.  */
 536
 537 static bool
 538 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 539                                    gphi *phi)
 540 {
 541   /* Ensure the loop latch definition is from within the loop.  */
 542   edge latch = loop_latch_edge (loop);
 543   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 544   if (TREE_CODE (ldef) != SSA_NAME
 545       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 546       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 547       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 548     return false;
 549
 550   tree def = gimple_phi_result (phi);
 551
 552   /* Ensure every use_stmt of the phi node is dominated by the latch
 553      definition.  */
 554   imm_use_iterator imm_iter;
 555   use_operand_p use_p;
 556   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 557     if (!is_gimple_debug (USE_STMT (use_p))
 558         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 559             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 560                                             USE_STMT (use_p))))
 561       return false;
 562
 563   /* First-order recurrence autovectorization needs shuffle vector.  */
 564   tree scalar_type = TREE_TYPE (def);
 565   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 566   if (!vectype)
 567     return false;
 568
 569   return true;
 570 }
 571
 572 /* Function vect_analyze_scalar_cycles_1.
 573
 574    Examine the cross iteration def-use cycles of scalar variables
 575    in LOOP.  LOOP_VINFO represents the loop that is now being
 576    considered for vectorization (can be LOOP, or an outer-loop
 577    enclosing LOOP).  SLP indicates there will be some subsequent
 578    slp analyses or not.  */
 579
 580 static void
 581 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 582                               bool slp)
 583 {
 584   basic_block bb = loop->header;
 585   tree init, step;
 586   auto_vec<stmt_vec_info, 64> worklist;
 587   gphi_iterator gsi;
 588   bool double_reduc, reduc_chain;
 589
 590   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 591
 592   /* First - identify all inductions.  Reduction detection assumes that all the
 593      inductions have been identified, therefore, this order must not be
 594      changed.  */
 595   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 596     {
 597       gphi *phi = gsi.phi ();
 598       tree access_fn = NULL;
 599       tree def = PHI_RESULT (phi);
 600       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 601
 602       if (dump_enabled_p ())
 603         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 604                          (gimple *) phi);
 605
 606       /* Skip virtual phi's.  The data dependences that are associated with
 607          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 608       if (virtual_operand_p (def))
 609         continue;
 610
 611       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 612
 613       /* Analyze the evolution function.  */
 614       access_fn = analyze_scalar_evolution (loop, def);
 615       if (access_fn)
 616         {
 617           STRIP_NOPS (access_fn);
 618           if (dump_enabled_p ())
 619             dump_printf_loc (MSG_NOTE, vect_location,
 620                              "Access function of PHI: %T\n", access_fn);
 621           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 622             = initial_condition_in_loop_num (access_fn, loop->num);
 623           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 624             = evolution_part_in_loop_num (access_fn, loop->num);
 625         }
 626
 627       if ((!access_fn
 628            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 629            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 630                                             &init, &step)
 631            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 632                && TREE_CODE (step) != INTEGER_CST))
 633           /* Only handle nonlinear iv for same loop.  */
 634           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 635               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 636                                                   phi, &init, &step)))
 637         {
 638           worklist.safe_push (stmt_vinfo);
 639           continue;
 640         }
 641
 642       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 643                   != NULL_TREE);
 644       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 645
 646       if (dump_enabled_p ())
 647         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 648       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 649     }
 650
 651
 652   /* Second - identify all reductions and nested cycles.  */
 653   while (worklist.length () > 0)
 654     {
 655       stmt_vec_info stmt_vinfo = worklist.pop ();
 656       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 657       tree def = PHI_RESULT (phi);
 658
 659       if (dump_enabled_p ())
 660         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 661                          (gimple *) phi);
 662
 663       gcc_assert (!virtual_operand_p (def)
 664                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 665
 666       stmt_vec_info reduc_stmt_info
 667         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 668                                     &reduc_chain, slp);
 669       if (reduc_stmt_info)
 670         {
 671           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 672           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 673           if (double_reduc)
 674             {
 675               if (dump_enabled_p ())
 676                 dump_printf_loc (MSG_NOTE, vect_location,
 677                                  "Detected double reduction.\n");
 678
 679               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 680               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 681             }
 682           else
 683             {
 684               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 685                 {
 686                   if (dump_enabled_p ())
 687                     dump_printf_loc (MSG_NOTE, vect_location,
 688                                      "Detected vectorizable nested cycle.\n");
 689
 690                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 691                 }
 692               else
 693                 {
 694                   if (dump_enabled_p ())
 695                     dump_printf_loc (MSG_NOTE, vect_location,
 696                                      "Detected reduction.\n");
 697
 698                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 699                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 700                   /* Store the reduction cycles for possible vectorization in
 701                      loop-aware SLP if it was not detected as reduction
 702                      chain.  */
 703                   if (! reduc_chain)
 704                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 705                       (reduc_stmt_info);
 706                 }
 707             }
 708         }
 709       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 710         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 711       else
 712         if (dump_enabled_p ())
 713           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 714                            "Unknown def-use cycle pattern.\n");
 715     }
 716 }
 717
 718
 719 /* Function vect_analyze_scalar_cycles.
 720
 721    Examine the cross iteration def-use cycles of scalar variables, by
 722    analyzing the loop-header PHIs of scalar variables.  Classify each
 723    cycle as one of the following: invariant, induction, reduction, unknown.
 724    We do that for the loop represented by LOOP_VINFO, and also to its
 725    inner-loop, if exists.
 726    Examples for scalar cycles:
 727
 728    Example1: reduction:
 729
 730               loop1:
 731               for (i=0; i<N; i++)
 732                  sum += a[i];
 733
 734    Example2: induction:
 735
 736               loop2:
 737               for (i=0; i<N; i++)
 738                  a[i] = i;  */
 739
 740 static void
 741 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 742 {
 743   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 744
 745   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 746
 747   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 748      Reductions in such inner-loop therefore have different properties than
 749      the reductions in the nest that gets vectorized:
 750      1. When vectorized, they are executed in the same order as in the original
 751         scalar loop, so we can't change the order of computation when
 752         vectorizing them.
 753      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 754         current checks are too strict.  */
 755
 756   if (loop->inner)
 757     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 758 }
 759
 760 /* Transfer group and reduction information from STMT_INFO to its
 761    pattern stmt.  */
 762
 763 static void
 764 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 765 {
 766   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 767   stmt_vec_info stmtp;
 768   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 769               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 770   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 771   do
 772     {
 773       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 774       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 775                            == STMT_VINFO_DEF_TYPE (stmt_info));
 776       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 777       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 778       if (stmt_info)
 779         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 780           = STMT_VINFO_RELATED_STMT (stmt_info);
 781     }
 782   while (stmt_info);
 783 }
 784
 785 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 786
 787 static void
 788 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 789 {
 790   stmt_vec_info first;
 791   unsigned i;
 792
 793   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 794     {
 795       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 796       while (next)
 797         {
 798           if ((STMT_VINFO_IN_PATTERN_P (next)
 799                != STMT_VINFO_IN_PATTERN_P (first))
 800               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 801             break;
 802           next = REDUC_GROUP_NEXT_ELEMENT (next);
 803         }
 804       /* If all reduction chain members are well-formed patterns adjust
 805          the group to group the pattern stmts instead.  */
 806       if (! next
 807           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 808         {
 809           if (STMT_VINFO_IN_PATTERN_P (first))
 810             {
 811               vect_fixup_reduc_chain (first);
 812               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 813                 = STMT_VINFO_RELATED_STMT (first);
 814             }
 815         }
 816       /* If not all stmt in the chain are patterns or if we failed
 817          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 818          it as regular reduction instead.  */
 819       else
 820         {
 821           stmt_vec_info vinfo = first;
 822           stmt_vec_info last = NULL;
 823           while (vinfo)
 824             {
 825               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 826               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 827               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 828               last = vinfo;
 829               vinfo = next;
 830             }
 831           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 832             = vect_internal_def;
 833           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 834           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 835           --i;
 836         }
 837     }
 838 }
 839
 840 /* Function vect_get_loop_niters.
 841
 842    Determine how many iterations the loop is executed and place it
 843    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 844    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 845    niter information holds in ASSUMPTIONS.
 846
 847    Return the loop exit condition.  */
 848
 849
 850 static gcond *
 851 vect_get_loop_niters (class loop *loop, tree *assumptions,
 852                       tree *number_of_iterations, tree *number_of_iterationsm1)
 853 {
 854   edge exit = single_exit (loop);
 855   class tree_niter_desc niter_desc;
 856   tree niter_assumptions, niter, may_be_zero;
 857   gcond *cond = get_loop_exit_condition (loop);
 858
 859   *assumptions = boolean_true_node;
 860   *number_of_iterationsm1 = chrec_dont_know;
 861   *number_of_iterations = chrec_dont_know;
 862   DUMP_VECT_SCOPE ("get_loop_niters");
 863
 864   if (!exit)
 865     return cond;
 866
 867   may_be_zero = NULL_TREE;
 868   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 869       || chrec_contains_undetermined (niter_desc.niter))
 870     return cond;
 871
 872   niter_assumptions = niter_desc.assumptions;
 873   may_be_zero = niter_desc.may_be_zero;
 874   niter = niter_desc.niter;
 875
 876   if (may_be_zero && integer_zerop (may_be_zero))
 877     may_be_zero = NULL_TREE;
 878
 879   if (may_be_zero)
 880     {
 881       if (COMPARISON_CLASS_P (may_be_zero))
 882         {
 883           /* Try to combine may_be_zero with assumptions, this can simplify
 884              computation of niter expression.  */
 885           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 886             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 887                                              niter_assumptions,
 888                                              fold_build1 (TRUTH_NOT_EXPR,
 889                                                           boolean_type_node,
 890                                                           may_be_zero));
 891           else
 892             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 893                                  build_int_cst (TREE_TYPE (niter), 0),
 894                                  rewrite_to_non_trapping_overflow (niter));
 895
 896           may_be_zero = NULL_TREE;
 897         }
 898       else if (integer_nonzerop (may_be_zero))
 899         {
 900           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 901           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 902           return cond;
 903         }
 904       else
 905         return cond;
 906     }
 907
 908   *assumptions = niter_assumptions;
 909   *number_of_iterationsm1 = niter;
 910
 911   /* We want the number of loop header executions which is the number
 912      of latch executions plus one.
 913      ???  For UINT_MAX latch executions this number overflows to zero
 914      for loops like do { n++; } while (n != 0);  */
 915   if (niter && !chrec_contains_undetermined (niter))
 916     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 917                           build_int_cst (TREE_TYPE (niter), 1));
 918   *number_of_iterations = niter;
 919
 920   return cond;
 921 }
 922
 923 /* Function bb_in_loop_p
 924
 925    Used as predicate for dfs order traversal of the loop bbs.  */
 926
 927 static bool
 928 bb_in_loop_p (const_basic_block bb, const void *data)
 929 {
 930   const class loop *const loop = (const class loop *)data;
 931   if (flow_bb_inside_loop_p (loop, bb))
 932     return true;
 933   return false;
 934 }
 935
 936
 937 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 938    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 939
 940 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 941   : vec_info (vec_info::loop, shared),
 942     loop (loop_in),
 943     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 944     num_itersm1 (NULL_TREE),
 945     num_iters (NULL_TREE),
 946     num_iters_unchanged (NULL_TREE),
 947     num_iters_assumptions (NULL_TREE),
 948     vector_costs (nullptr),
 949     scalar_costs (nullptr),
 950     th (0),
 951     versioning_threshold (0),
 952     vectorization_factor (0),
 953     main_loop_edge (nullptr),
 954     skip_main_loop_edge (nullptr),
 955     skip_this_loop_edge (nullptr),
 956     reusable_accumulators (),
 957     suggested_unroll_factor (1),
 958     max_vectorization_factor (0),
 959     mask_skip_niters (NULL_TREE),
 960     rgroup_compare_type (NULL_TREE),
 961     simd_if_cond (NULL_TREE),
 962     unaligned_dr (NULL),
 963     peeling_for_alignment (0),
 964     ptr_mask (0),
 965     ivexpr_map (NULL),
 966     scan_map (NULL),
 967     slp_unrolling_factor (1),
 968     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 969     vectorizable (false),
 970     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 971     using_partial_vectors_p (false),
 972     epil_using_partial_vectors_p (false),
 973     partial_load_store_bias (0),
 974     peeling_for_gaps (false),
 975     peeling_for_niter (false),
 976     no_data_dependencies (false),
 977     has_mask_store (false),
 978     scalar_loop_scaling (profile_probability::uninitialized ()),
 979     scalar_loop (NULL),
 980     orig_loop_info (NULL)
 981 {
 982   /* CHECKME: We want to visit all BBs before their successors (except for
 983      latch blocks, for which this assertion wouldn't hold).  In the simple
 984      case of the loop forms we allow, a dfs order of the BBs would the same
 985      as reversed postorder traversal, so we are safe.  */
 986
 987   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 988                                           bbs, loop->num_nodes, loop);
 989   gcc_assert (nbbs == loop->num_nodes);
 990
 991   for (unsigned int i = 0; i < nbbs; i++)
 992     {
 993       basic_block bb = bbs[i];
 994       gimple_stmt_iterator si;
 995
 996       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 997         {
 998           gimple *phi = gsi_stmt (si);
 999           gimple_set_uid (phi, 0);
1000           add_stmt (phi);
1001         }
1002
1003       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1004         {
1005           gimple *stmt = gsi_stmt (si);
1006           gimple_set_uid (stmt, 0);
1007           if (is_gimple_debug (stmt))
1008             continue;
1009           add_stmt (stmt);
1010           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1011              third argument is the #pragma omp simd if (x) condition, when 0,
1012              loop shouldn't be vectorized, when non-zero constant, it should
1013              be vectorized normally, otherwise versioned with vectorized loop
1014              done if the condition is non-zero at runtime.  */
1015           if (loop_in->simduid
1016               && is_gimple_call (stmt)
1017               && gimple_call_internal_p (stmt)
1018               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1019               && gimple_call_num_args (stmt) >= 3
1020               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1021               && (loop_in->simduid
1022                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1023             {
1024               tree arg = gimple_call_arg (stmt, 2);
1025               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1026                 simd_if_cond = arg;
1027               else
1028                 gcc_assert (integer_nonzerop (arg));
1029             }
1030         }
1031     }
1032
1033   epilogue_vinfos.create (6);
1034 }
1035
1036 /* Free all levels of rgroup CONTROLS.  */
1037
1038 void
1039 release_vec_loop_controls (vec<rgroup_controls> *controls)
1040 {
1041   rgroup_controls *rgc;
1042   unsigned int i;
1043   FOR_EACH_VEC_ELT (*controls, i, rgc)
1044     rgc->controls.release ();
1045   controls->release ();
1046 }
1047
1048 /* Free all memory used by the _loop_vec_info, as well as all the
1049    stmt_vec_info structs of all the stmts in the loop.  */
1050
1051 _loop_vec_info::~_loop_vec_info ()
1052 {
1053   free (bbs);
1054
1055   release_vec_loop_controls (&masks);
1056   release_vec_loop_controls (&lens);
1057   delete ivexpr_map;
1058   delete scan_map;
1059   epilogue_vinfos.release ();
1060   delete scalar_costs;
1061   delete vector_costs;
1062
1063   /* When we release an epiloge vinfo that we do not intend to use
1064      avoid clearing AUX of the main loop which should continue to
1065      point to the main loop vinfo since otherwise we'll leak that.  */
1066   if (loop->aux == this)
1067     loop->aux = NULL;
1068 }
1069
1070 /* Return an invariant or register for EXPR and emit necessary
1071    computations in the LOOP_VINFO loop preheader.  */
1072
1073 tree
1074 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1075 {
1076   if (is_gimple_reg (expr)
1077       || is_gimple_min_invariant (expr))
1078     return expr;
1079
1080   if (! loop_vinfo->ivexpr_map)
1081     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1082   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1083   if (! cached)
1084     {
1085       gimple_seq stmts = NULL;
1086       cached = force_gimple_operand (unshare_expr (expr),
1087                                      &stmts, true, NULL_TREE);
1088       if (stmts)
1089         {
1090           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1091           gsi_insert_seq_on_edge_immediate (e, stmts);
1092         }
1093     }
1094   return cached;
1095 }
1096
1097 /* Return true if we can use CMP_TYPE as the comparison type to produce
1098    all masks required to mask LOOP_VINFO.  */
1099
1100 static bool
1101 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1102 {
1103   rgroup_controls *rgm;
1104   unsigned int i;
1105   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1106     if (rgm->type != NULL_TREE
1107         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1108                                             cmp_type, rgm->type,
1109                                             OPTIMIZE_FOR_SPEED))
1110       return false;
1111   return true;
1112 }
1113
1114 /* Calculate the maximum number of scalars per iteration for every
1115    rgroup in LOOP_VINFO.  */
1116
1117 static unsigned int
1118 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1119 {
1120   unsigned int res = 1;
1121   unsigned int i;
1122   rgroup_controls *rgm;
1123   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1124     res = MAX (res, rgm->max_nscalars_per_iter);
1125   return res;
1126 }
1127
1128 /* Calculate the minimum precision necessary to represent:
1129
1130       MAX_NITERS * FACTOR
1131
1132    as an unsigned integer, where MAX_NITERS is the maximum number of
1133    loop header iterations for the original scalar form of LOOP_VINFO.  */
1134
1135 static unsigned
1136 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1137 {
1138   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1139
1140   /* Get the maximum number of iterations that is representable
1141      in the counter type.  */
1142   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1143   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1144
1145   /* Get a more refined estimate for the number of iterations.  */
1146   widest_int max_back_edges;
1147   if (max_loop_iterations (loop, &max_back_edges))
1148     max_ni = wi::smin (max_ni, max_back_edges + 1);
1149
1150   /* Work out how many bits we need to represent the limit.  */
1151   return wi::min_precision (max_ni * factor, UNSIGNED);
1152 }
1153
1154 /* True if the loop needs peeling or partial vectors when vectorized.  */
1155
1156 static bool
1157 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1158 {
1159   unsigned HOST_WIDE_INT const_vf;
1160   HOST_WIDE_INT max_niter
1161     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1162
1163   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1164   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1165     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1166                                           (loop_vinfo));
1167
1168   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1169       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1170     {
1171       /* Work out the (constant) number of iterations that need to be
1172          peeled for reasons other than niters.  */
1173       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1174       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1175         peel_niter += 1;
1176       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1177                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1178         return true;
1179     }
1180   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1181       /* ??? When peeling for gaps but not alignment, we could
1182          try to check whether the (variable) niters is known to be
1183          VF * N + 1.  That's something of a niche case though.  */
1184       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1185       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1186       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1187            < (unsigned) exact_log2 (const_vf))
1188           /* In case of versioning, check if the maximum number of
1189              iterations is greater than th.  If they are identical,
1190              the epilogue is unnecessary.  */
1191           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1192               || ((unsigned HOST_WIDE_INT) max_niter
1193                   > (th / const_vf) * const_vf))))
1194     return true;
1195
1196   return false;
1197 }
1198
1199 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1200    whether we can actually generate the masks required.  Return true if so,
1201    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1202
1203 static bool
1204 vect_verify_full_masking (loop_vec_info loop_vinfo)
1205 {
1206   unsigned int min_ni_width;
1207   unsigned int max_nscalars_per_iter
1208     = vect_get_max_nscalars_per_iter (loop_vinfo);
1209
1210   /* Use a normal loop if there are no statements that need masking.
1211      This only happens in rare degenerate cases: it means that the loop
1212      has no loads, no stores, and no live-out values.  */
1213   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1214     return false;
1215
1216   /* Work out how many bits we need to represent the limit.  */
1217   min_ni_width
1218     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1219
1220   /* Find a scalar mode for which WHILE_ULT is supported.  */
1221   opt_scalar_int_mode cmp_mode_iter;
1222   tree cmp_type = NULL_TREE;
1223   tree iv_type = NULL_TREE;
1224   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1225   unsigned int iv_precision = UINT_MAX;
1226
1227   if (iv_limit != -1)
1228     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1229                                       UNSIGNED);
1230
1231   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1232     {
1233       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1234       if (cmp_bits >= min_ni_width
1235           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1236         {
1237           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1238           if (this_type
1239               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1240             {
1241               /* Although we could stop as soon as we find a valid mode,
1242                  there are at least two reasons why that's not always the
1243                  best choice:
1244
1245                  - An IV that's Pmode or wider is more likely to be reusable
1246                    in address calculations than an IV that's narrower than
1247                    Pmode.
1248
1249                  - Doing the comparison in IV_PRECISION or wider allows
1250                    a natural 0-based IV, whereas using a narrower comparison
1251                    type requires mitigations against wrap-around.
1252
1253                  Conversely, if the IV limit is variable, doing the comparison
1254                  in a wider type than the original type can introduce
1255                  unnecessary extensions, so picking the widest valid mode
1256                  is not always a good choice either.
1257
1258                  Here we prefer the first IV type that's Pmode or wider,
1259                  and the first comparison type that's IV_PRECISION or wider.
1260                  (The comparison type must be no wider than the IV type,
1261                  to avoid extensions in the vector loop.)
1262
1263                  ??? We might want to try continuing beyond Pmode for ILP32
1264                  targets if CMP_BITS < IV_PRECISION.  */
1265               iv_type = this_type;
1266               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1267                 cmp_type = this_type;
1268               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1269                 break;
1270             }
1271         }
1272     }
1273
1274   if (!cmp_type)
1275     return false;
1276
1277   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1278   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1279   return true;
1280 }
1281
1282 /* Check whether we can use vector access with length based on precison
1283    comparison.  So far, to keep it simple, we only allow the case that the
1284    precision of the target supported length is larger than the precision
1285    required by loop niters.  */
1286
1287 static bool
1288 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1289 {
1290   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1291     return false;
1292
1293   machine_mode len_load_mode = get_len_load_store_mode
1294     (loop_vinfo->vector_mode, true).require ();
1295   machine_mode len_store_mode = get_len_load_store_mode
1296     (loop_vinfo->vector_mode, false).require ();
1297
1298   signed char partial_load_bias = internal_len_load_store_bias
1299     (IFN_LEN_LOAD, len_load_mode);
1300
1301   signed char partial_store_bias = internal_len_load_store_bias
1302     (IFN_LEN_STORE, len_store_mode);
1303
1304   gcc_assert (partial_load_bias == partial_store_bias);
1305
1306   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1307     return false;
1308
1309   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1310      len_loads with a length of zero.  In order to avoid that we prohibit
1311      more than one loop length here.  */
1312   if (partial_load_bias == -1
1313       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1314     return false;
1315
1316   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1317
1318   unsigned int max_nitems_per_iter = 1;
1319   unsigned int i;
1320   rgroup_controls *rgl;
1321   /* Find the maximum number of items per iteration for every rgroup.  */
1322   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1323     {
1324       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1325       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1326     }
1327
1328   /* Work out how many bits we need to represent the length limit.  */
1329   unsigned int min_ni_prec
1330     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1331
1332   /* Now use the maximum of below precisions for one suitable IV type:
1333      - the IV's natural precision
1334      - the precision needed to hold: the maximum number of scalar
1335        iterations multiplied by the scale factor (min_ni_prec above)
1336      - the Pmode precision
1337
1338      If min_ni_prec is less than the precision of the current niters,
1339      we perfer to still use the niters type.  Prefer to use Pmode and
1340      wider IV to avoid narrow conversions.  */
1341
1342   unsigned int ni_prec
1343     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1344   min_ni_prec = MAX (min_ni_prec, ni_prec);
1345   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1346
1347   tree iv_type = NULL_TREE;
1348   opt_scalar_int_mode tmode_iter;
1349   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1350     {
1351       scalar_mode tmode = tmode_iter.require ();
1352       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1353
1354       /* ??? Do we really want to construct one IV whose precision exceeds
1355          BITS_PER_WORD?  */
1356       if (tbits > BITS_PER_WORD)
1357         break;
1358
1359       /* Find the first available standard integral type.  */
1360       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1361         {
1362           iv_type = build_nonstandard_integer_type (tbits, true);
1363           break;
1364         }
1365     }
1366
1367   if (!iv_type)
1368     {
1369       if (dump_enabled_p ())
1370         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371                          "can't vectorize with length-based partial vectors"
1372                          " because there is no suitable iv type.\n");
1373       return false;
1374     }
1375
1376   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1377   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1378
1379   return true;
1380 }
1381
1382 /* Calculate the cost of one scalar iteration of the loop.  */
1383 static void
1384 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1385 {
1386   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1387   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1388   int nbbs = loop->num_nodes, factor;
1389   int innerloop_iters, i;
1390
1391   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1392
1393   /* Gather costs for statements in the scalar loop.  */
1394
1395   /* FORNOW.  */
1396   innerloop_iters = 1;
1397   if (loop->inner)
1398     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1399
1400   for (i = 0; i < nbbs; i++)
1401     {
1402       gimple_stmt_iterator si;
1403       basic_block bb = bbs[i];
1404
1405       if (bb->loop_father == loop->inner)
1406         factor = innerloop_iters;
1407       else
1408         factor = 1;
1409
1410       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1411         {
1412           gimple *stmt = gsi_stmt (si);
1413           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1414
1415           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1416             continue;
1417
1418           /* Skip stmts that are not vectorized inside the loop.  */
1419           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1420           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1421               && (!STMT_VINFO_LIVE_P (vstmt_info)
1422                   || !VECTORIZABLE_CYCLE_DEF
1423                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1424             continue;
1425
1426           vect_cost_for_stmt kind;
1427           if (STMT_VINFO_DATA_REF (stmt_info))
1428             {
1429               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1430                kind = scalar_load;
1431              else
1432                kind = scalar_store;
1433             }
1434           else if (vect_nop_conversion_p (stmt_info))
1435             continue;
1436           else
1437             kind = scalar_stmt;
1438
1439           /* We are using vect_prologue here to avoid scaling twice
1440              by the inner loop factor.  */
1441           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1442                             factor, kind, stmt_info, 0, vect_prologue);
1443         }
1444     }
1445
1446   /* Now accumulate cost.  */
1447   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1448   add_stmt_costs (loop_vinfo->scalar_costs,
1449                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1450   loop_vinfo->scalar_costs->finish_cost (nullptr);
1451 }
1452
1453
1454 /* Function vect_analyze_loop_form.
1455
1456    Verify that certain CFG restrictions hold, including:
1457    - the loop has a pre-header
1458    - the loop has a single entry and exit
1459    - the loop exit condition is simple enough
1460    - the number of iterations can be analyzed, i.e, a countable loop.  The
1461      niter could be analyzed under some assumptions.  */
1462
1463 opt_result
1464 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1465 {
1466   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1467
1468   /* Different restrictions apply when we are considering an inner-most loop,
1469      vs. an outer (nested) loop.
1470      (FORNOW. May want to relax some of these restrictions in the future).  */
1471
1472   info->inner_loop_cond = NULL;
1473   if (!loop->inner)
1474     {
1475       /* Inner-most loop.  We currently require that the number of BBs is
1476          exactly 2 (the header and latch).  Vectorizable inner-most loops
1477          look like this:
1478
1479                         (pre-header)
1480                            |
1481                           header <--------+
1482                            | |            |
1483                            | +--> latch --+
1484                            |
1485                         (exit-bb)  */
1486
1487       if (loop->num_nodes != 2)
1488         return opt_result::failure_at (vect_location,
1489                                        "not vectorized:"
1490                                        " control flow in loop.\n");
1491
1492       if (empty_block_p (loop->header))
1493         return opt_result::failure_at (vect_location,
1494                                        "not vectorized: empty loop.\n");
1495     }
1496   else
1497     {
1498       class loop *innerloop = loop->inner;
1499       edge entryedge;
1500
1501       /* Nested loop. We currently require that the loop is doubly-nested,
1502          contains a single inner loop, and the number of BBs is exactly 5.
1503          Vectorizable outer-loops look like this:
1504
1505                         (pre-header)
1506                            |
1507                           header <---+
1508                            |         |
1509                           inner-loop |
1510                            |         |
1511                           tail ------+
1512                            |
1513                         (exit-bb)
1514
1515          The inner-loop has the properties expected of inner-most loops
1516          as described above.  */
1517
1518       if ((loop->inner)->inner || (loop->inner)->next)
1519         return opt_result::failure_at (vect_location,
1520                                        "not vectorized:"
1521                                        " multiple nested loops.\n");
1522
1523       if (loop->num_nodes != 5)
1524         return opt_result::failure_at (vect_location,
1525                                        "not vectorized:"
1526                                        " control flow in loop.\n");
1527
1528       entryedge = loop_preheader_edge (innerloop);
1529       if (entryedge->src != loop->header
1530           || !single_exit (innerloop)
1531           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1532         return opt_result::failure_at (vect_location,
1533                                        "not vectorized:"
1534                                        " unsupported outerloop form.\n");
1535
1536       /* Analyze the inner-loop.  */
1537       vect_loop_form_info inner;
1538       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1539       if (!res)
1540         {
1541           if (dump_enabled_p ())
1542             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1543                              "not vectorized: Bad inner loop.\n");
1544           return res;
1545         }
1546
1547       /* Don't support analyzing niter under assumptions for inner
1548          loop.  */
1549       if (!integer_onep (inner.assumptions))
1550         return opt_result::failure_at (vect_location,
1551                                        "not vectorized: Bad inner loop.\n");
1552
1553       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1554         return opt_result::failure_at (vect_location,
1555                                        "not vectorized: inner-loop count not"
1556                                        " invariant.\n");
1557
1558       if (dump_enabled_p ())
1559         dump_printf_loc (MSG_NOTE, vect_location,
1560                          "Considering outer-loop vectorization.\n");
1561       info->inner_loop_cond = inner.loop_cond;
1562     }
1563
1564   if (!single_exit (loop))
1565     return opt_result::failure_at (vect_location,
1566                                    "not vectorized: multiple exits.\n");
1567   if (EDGE_COUNT (loop->header->preds) != 2)
1568     return opt_result::failure_at (vect_location,
1569                                    "not vectorized:"
1570                                    " too many incoming edges.\n");
1571
1572   /* We assume that the loop exit condition is at the end of the loop. i.e,
1573      that the loop is represented as a do-while (with a proper if-guard
1574      before the loop if needed), where the loop header contains all the
1575      executable statements, and the latch is empty.  */
1576   if (!empty_block_p (loop->latch)
1577       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1578     return opt_result::failure_at (vect_location,
1579                                    "not vectorized: latch block not empty.\n");
1580
1581   /* Make sure the exit is not abnormal.  */
1582   edge e = single_exit (loop);
1583   if (e->flags & EDGE_ABNORMAL)
1584     return opt_result::failure_at (vect_location,
1585                                    "not vectorized:"
1586                                    " abnormal loop exit edge.\n");
1587
1588   info->loop_cond
1589     = vect_get_loop_niters (loop, &info->assumptions,
1590                             &info->number_of_iterations,
1591                             &info->number_of_iterationsm1);
1592   if (!info->loop_cond)
1593     return opt_result::failure_at
1594       (vect_location,
1595        "not vectorized: complicated exit condition.\n");
1596
1597   if (integer_zerop (info->assumptions)
1598       || !info->number_of_iterations
1599       || chrec_contains_undetermined (info->number_of_iterations))
1600     return opt_result::failure_at
1601       (info->loop_cond,
1602        "not vectorized: number of iterations cannot be computed.\n");
1603
1604   if (integer_zerop (info->number_of_iterations))
1605     return opt_result::failure_at
1606       (info->loop_cond,
1607        "not vectorized: number of iterations = 0.\n");
1608
1609   if (!(tree_fits_shwi_p (info->number_of_iterations)
1610         && tree_to_shwi (info->number_of_iterations) > 0))
1611     {
1612       if (dump_enabled_p ())
1613         {
1614           dump_printf_loc (MSG_NOTE, vect_location,
1615                            "Symbolic number of iterations is ");
1616           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1617           dump_printf (MSG_NOTE, "\n");
1618         }
1619     }
1620
1621   return opt_result::success ();
1622 }
1623
1624 /* Create a loop_vec_info for LOOP with SHARED and the
1625    vect_analyze_loop_form result.  */
1626
1627 loop_vec_info
1628 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1629                         const vect_loop_form_info *info,
1630                         loop_vec_info main_loop_info)
1631 {
1632   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1633   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1634   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1635   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1636   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1637   /* Also record the assumptions for versioning.  */
1638   if (!integer_onep (info->assumptions) && !main_loop_info)
1639     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1640
1641   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1642   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1643   if (info->inner_loop_cond)
1644     {
1645       stmt_vec_info inner_loop_cond_info
1646         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1647       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1648       /* If we have an estimate on the number of iterations of the inner
1649          loop use that to limit the scale for costing, otherwise use
1650          --param vect-inner-loop-cost-factor literally.  */
1651       widest_int nit;
1652       if (estimated_stmt_executions (loop->inner, &nit))
1653         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1654           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1655     }
1656
1657   return loop_vinfo;
1658 }
1659
1660
1661
1662 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1663    statements update the vectorization factor.  */
1664
1665 static void
1666 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1667 {
1668   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1669   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1670   int nbbs = loop->num_nodes;
1671   poly_uint64 vectorization_factor;
1672   int i;
1673
1674   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1675
1676   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1677   gcc_assert (known_ne (vectorization_factor, 0U));
1678
1679   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1680      vectorization factor of the loop is the unrolling factor required by
1681      the SLP instances.  If that unrolling factor is 1, we say, that we
1682      perform pure SLP on loop - cross iteration parallelism is not
1683      exploited.  */
1684   bool only_slp_in_loop = true;
1685   for (i = 0; i < nbbs; i++)
1686     {
1687       basic_block bb = bbs[i];
1688       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1689            gsi_next (&si))
1690         {
1691           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1692           if (!stmt_info)
1693             continue;
1694           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1695                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1696               && !PURE_SLP_STMT (stmt_info))
1697             /* STMT needs both SLP and loop-based vectorization.  */
1698             only_slp_in_loop = false;
1699         }
1700       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1701            gsi_next (&si))
1702         {
1703           if (is_gimple_debug (gsi_stmt (si)))
1704             continue;
1705           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1706           stmt_info = vect_stmt_to_vectorize (stmt_info);
1707           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1708                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1709               && !PURE_SLP_STMT (stmt_info))
1710             /* STMT needs both SLP and loop-based vectorization.  */
1711             only_slp_in_loop = false;
1712         }
1713     }
1714
1715   if (only_slp_in_loop)
1716     {
1717       if (dump_enabled_p ())
1718         dump_printf_loc (MSG_NOTE, vect_location,
1719                          "Loop contains only SLP stmts\n");
1720       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1721     }
1722   else
1723     {
1724       if (dump_enabled_p ())
1725         dump_printf_loc (MSG_NOTE, vect_location,
1726                          "Loop contains SLP and non-SLP stmts\n");
1727       /* Both the vectorization factor and unroll factor have the form
1728          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1729          so they must have a common multiple.  */
1730       vectorization_factor
1731         = force_common_multiple (vectorization_factor,
1732                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1733     }
1734
1735   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1736   if (dump_enabled_p ())
1737     {
1738       dump_printf_loc (MSG_NOTE, vect_location,
1739                        "Updating vectorization factor to ");
1740       dump_dec (MSG_NOTE, vectorization_factor);
1741       dump_printf (MSG_NOTE, ".\n");
1742     }
1743 }
1744
1745 /* Return true if STMT_INFO describes a double reduction phi and if
1746    the other phi in the reduction is also relevant for vectorization.
1747    This rejects cases such as:
1748
1749       outer1:
1750         x_1 = PHI <x_3(outer2), ...>;
1751         ...
1752
1753       inner:
1754         x_2 = ...;
1755         ...
1756
1757       outer2:
1758         x_3 = PHI <x_2(inner)>;
1759
1760    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1761
1762 static bool
1763 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1764 {
1765   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1766     return false;
1767
1768   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1769 }
1770
1771 /* Function vect_analyze_loop_operations.
1772
1773    Scan the loop stmts and make sure they are all vectorizable.  */
1774
1775 static opt_result
1776 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1777 {
1778   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1779   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1780   int nbbs = loop->num_nodes;
1781   int i;
1782   stmt_vec_info stmt_info;
1783   bool need_to_vectorize = false;
1784   bool ok;
1785
1786   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1787
1788   auto_vec<stmt_info_for_cost> cost_vec;
1789
1790   for (i = 0; i < nbbs; i++)
1791     {
1792       basic_block bb = bbs[i];
1793
1794       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1795            gsi_next (&si))
1796         {
1797           gphi *phi = si.phi ();
1798           ok = true;
1799
1800           stmt_info = loop_vinfo->lookup_stmt (phi);
1801           if (dump_enabled_p ())
1802             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1803                              (gimple *) phi);
1804           if (virtual_operand_p (gimple_phi_result (phi)))
1805             continue;
1806
1807           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1808              (i.e., a phi in the tail of the outer-loop).  */
1809           if (! is_loop_header_bb_p (bb))
1810             {
1811               /* FORNOW: we currently don't support the case that these phis
1812                  are not used in the outerloop (unless it is double reduction,
1813                  i.e., this phi is vect_reduction_def), cause this case
1814                  requires to actually do something here.  */
1815               if (STMT_VINFO_LIVE_P (stmt_info)
1816                   && !vect_active_double_reduction_p (stmt_info))
1817                 return opt_result::failure_at (phi,
1818                                                "Unsupported loop-closed phi"
1819                                                " in outer-loop.\n");
1820
1821               /* If PHI is used in the outer loop, we check that its operand
1822                  is defined in the inner loop.  */
1823               if (STMT_VINFO_RELEVANT_P (stmt_info))
1824                 {
1825                   tree phi_op;
1826
1827                   if (gimple_phi_num_args (phi) != 1)
1828                     return opt_result::failure_at (phi, "unsupported phi");
1829
1830                   phi_op = PHI_ARG_DEF (phi, 0);
1831                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1832                   if (!op_def_info)
1833                     return opt_result::failure_at (phi, "unsupported phi\n");
1834
1835                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1836                       && (STMT_VINFO_RELEVANT (op_def_info)
1837                           != vect_used_in_outer_by_reduction))
1838                     return opt_result::failure_at (phi, "unsupported phi\n");
1839
1840                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1841                        || (STMT_VINFO_DEF_TYPE (stmt_info)
1842                            == vect_double_reduction_def))
1843                       && !vectorizable_lc_phi (loop_vinfo,
1844                                                stmt_info, NULL, NULL))
1845                     return opt_result::failure_at (phi, "unsupported phi\n");
1846                 }
1847
1848               continue;
1849             }
1850
1851           gcc_assert (stmt_info);
1852
1853           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1854                || STMT_VINFO_LIVE_P (stmt_info))
1855               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
1856               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
1857             /* A scalar-dependence cycle that we don't support.  */
1858             return opt_result::failure_at (phi,
1859                                            "not vectorized:"
1860                                            " scalar dependence cycle.\n");
1861
1862           if (STMT_VINFO_RELEVANT_P (stmt_info))
1863             {
1864               need_to_vectorize = true;
1865               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1866                   && ! PURE_SLP_STMT (stmt_info))
1867                 ok = vectorizable_induction (loop_vinfo,
1868                                              stmt_info, NULL, NULL,
1869                                              &cost_vec);
1870               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1871                         || (STMT_VINFO_DEF_TYPE (stmt_info)
1872                             == vect_double_reduction_def)
1873                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1874                        && ! PURE_SLP_STMT (stmt_info))
1875                 ok = vectorizable_reduction (loop_vinfo,
1876                                              stmt_info, NULL, NULL, &cost_vec);
1877               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
1878                         == vect_first_order_recurrence)
1879                        && ! PURE_SLP_STMT (stmt_info))
1880                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
1881                                            &cost_vec);
1882             }
1883
1884           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1885           if (ok
1886               && STMT_VINFO_LIVE_P (stmt_info)
1887               && !PURE_SLP_STMT (stmt_info))
1888             ok = vectorizable_live_operation (loop_vinfo,
1889                                               stmt_info, NULL, NULL, NULL,
1890                                               -1, false, &cost_vec);
1891
1892           if (!ok)
1893             return opt_result::failure_at (phi,
1894                                            "not vectorized: relevant phi not "
1895                                            "supported: %G",
1896                                            static_cast <gimple *> (phi));
1897         }
1898
1899       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1900            gsi_next (&si))
1901         {
1902           gimple *stmt = gsi_stmt (si);
1903           if (!gimple_clobber_p (stmt)
1904               && !is_gimple_debug (stmt))
1905             {
1906               opt_result res
1907                 = vect_analyze_stmt (loop_vinfo,
1908                                      loop_vinfo->lookup_stmt (stmt),
1909                                      &need_to_vectorize,
1910                                      NULL, NULL, &cost_vec);
1911               if (!res)
1912                 return res;
1913             }
1914         }
1915     } /* bbs */
1916
1917   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1918
1919   /* All operations in the loop are either irrelevant (deal with loop
1920      control, or dead), or only used outside the loop and can be moved
1921      out of the loop (e.g. invariants, inductions).  The loop can be
1922      optimized away by scalar optimizations.  We're better off not
1923      touching this loop.  */
1924   if (!need_to_vectorize)
1925     {
1926       if (dump_enabled_p ())
1927         dump_printf_loc (MSG_NOTE, vect_location,
1928                          "All the computation can be taken out of the loop.\n");
1929       return opt_result::failure_at
1930         (vect_location,
1931          "not vectorized: redundant loop. no profit to vectorize.\n");
1932     }
1933
1934   return opt_result::success ();
1935 }
1936
1937 /* Return true if we know that the iteration count is smaller than the
1938    vectorization factor.  Return false if it isn't, or if we can't be sure
1939    either way.  */
1940
1941 static bool
1942 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1943 {
1944   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1945
1946   HOST_WIDE_INT max_niter;
1947   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1948     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1949   else
1950     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1951
1952   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1953     return true;
1954
1955   return false;
1956 }
1957
1958 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1959    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1960    definitely no, or -1 if it's worth retrying.  */
1961
1962 static int
1963 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1964                            unsigned *suggested_unroll_factor)
1965 {
1966   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1967   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1968
1969   /* Only loops that can handle partially-populated vectors can have iteration
1970      counts less than the vectorization factor.  */
1971   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1972     {
1973       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1974         {
1975           if (dump_enabled_p ())
1976             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1977                              "not vectorized: iteration count smaller than "
1978                              "vectorization factor.\n");
1979           return 0;
1980         }
1981     }
1982
1983   /* If using the "very cheap" model. reject cases in which we'd keep
1984      a copy of the scalar code (even if we might be able to vectorize it).  */
1985   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1986       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1987           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1988           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1989     {
1990       if (dump_enabled_p ())
1991         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992                          "some scalar iterations would need to be peeled\n");
1993       return 0;
1994     }
1995
1996   int min_profitable_iters, min_profitable_estimate;
1997   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1998                                       &min_profitable_estimate,
1999                                       suggested_unroll_factor);
2000
2001   if (min_profitable_iters < 0)
2002     {
2003       if (dump_enabled_p ())
2004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005                          "not vectorized: vectorization not profitable.\n");
2006       if (dump_enabled_p ())
2007         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2008                          "not vectorized: vector version will never be "
2009                          "profitable.\n");
2010       return -1;
2011     }
2012
2013   int min_scalar_loop_bound = (param_min_vect_loop_bound
2014                                * assumed_vf);
2015
2016   /* Use the cost model only if it is more conservative than user specified
2017      threshold.  */
2018   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2019                                     min_profitable_iters);
2020
2021   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2022
2023   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2024       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2025     {
2026       if (dump_enabled_p ())
2027         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2028                          "not vectorized: vectorization not profitable.\n");
2029       if (dump_enabled_p ())
2030         dump_printf_loc (MSG_NOTE, vect_location,
2031                          "not vectorized: iteration count smaller than user "
2032                          "specified loop bound parameter or minimum profitable "
2033                          "iterations (whichever is more conservative).\n");
2034       return 0;
2035     }
2036
2037   /* The static profitablity threshold min_profitable_estimate includes
2038      the cost of having to check at runtime whether the scalar loop
2039      should be used instead.  If it turns out that we don't need or want
2040      such a check, the threshold we should use for the static estimate
2041      is simply the point at which the vector loop becomes more profitable
2042      than the scalar loop.  */
2043   if (min_profitable_estimate > min_profitable_iters
2044       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2045       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2046       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2047       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2048     {
2049       if (dump_enabled_p ())
2050         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2051                          " choice between the scalar and vector loops\n");
2052       min_profitable_estimate = min_profitable_iters;
2053     }
2054
2055   /* If the vector loop needs multiple iterations to be beneficial then
2056      things are probably too close to call, and the conservative thing
2057      would be to stick with the scalar code.  */
2058   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2059       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2060     {
2061       if (dump_enabled_p ())
2062         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063                          "one iteration of the vector loop would be"
2064                          " more expensive than the equivalent number of"
2065                          " iterations of the scalar loop\n");
2066       return 0;
2067     }
2068
2069   HOST_WIDE_INT estimated_niter;
2070
2071   /* If we are vectorizing an epilogue then we know the maximum number of
2072      scalar iterations it will cover is at least one lower than the
2073      vectorization factor of the main loop.  */
2074   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2075     estimated_niter
2076       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2077   else
2078     {
2079       estimated_niter = estimated_stmt_executions_int (loop);
2080       if (estimated_niter == -1)
2081         estimated_niter = likely_max_stmt_executions_int (loop);
2082     }
2083   if (estimated_niter != -1
2084       && ((unsigned HOST_WIDE_INT) estimated_niter
2085           < MAX (th, (unsigned) min_profitable_estimate)))
2086     {
2087       if (dump_enabled_p ())
2088         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089                          "not vectorized: estimated iteration count too "
2090                          "small.\n");
2091       if (dump_enabled_p ())
2092         dump_printf_loc (MSG_NOTE, vect_location,
2093                          "not vectorized: estimated iteration count smaller "
2094                          "than specified loop bound parameter or minimum "
2095                          "profitable iterations (whichever is more "
2096                          "conservative).\n");
2097       return -1;
2098     }
2099
2100   return 1;
2101 }
2102
2103 static opt_result
2104 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2105                            vec<data_reference_p> *datarefs,
2106                            unsigned int *n_stmts)
2107 {
2108   *n_stmts = 0;
2109   for (unsigned i = 0; i < loop->num_nodes; i++)
2110     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2111          !gsi_end_p (gsi); gsi_next (&gsi))
2112       {
2113         gimple *stmt = gsi_stmt (gsi);
2114         if (is_gimple_debug (stmt))
2115           continue;
2116         ++(*n_stmts);
2117         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2118                                                         NULL, 0);
2119         if (!res)
2120           {
2121             if (is_gimple_call (stmt) && loop->safelen)
2122               {
2123                 tree fndecl = gimple_call_fndecl (stmt), op;
2124                 if (fndecl != NULL_TREE)
2125                   {
2126                     cgraph_node *node = cgraph_node::get (fndecl);
2127                     if (node != NULL && node->simd_clones != NULL)
2128                       {
2129                         unsigned int j, n = gimple_call_num_args (stmt);
2130                         for (j = 0; j < n; j++)
2131                           {
2132                             op = gimple_call_arg (stmt, j);
2133                             if (DECL_P (op)
2134                                 || (REFERENCE_CLASS_P (op)
2135                                     && get_base_address (op)))
2136                               break;
2137                           }
2138                         op = gimple_call_lhs (stmt);
2139                         /* Ignore #pragma omp declare simd functions
2140                            if they don't have data references in the
2141                            call stmt itself.  */
2142                         if (j == n
2143                             && !(op
2144                                  && (DECL_P (op)
2145                                      || (REFERENCE_CLASS_P (op)
2146                                          && get_base_address (op)))))
2147                           continue;
2148                       }
2149                   }
2150               }
2151             return res;
2152           }
2153         /* If dependence analysis will give up due to the limit on the
2154            number of datarefs stop here and fail fatally.  */
2155         if (datarefs->length ()
2156             > (unsigned)param_loop_max_datarefs_for_datadeps)
2157           return opt_result::failure_at (stmt, "exceeded param "
2158                                          "loop-max-datarefs-for-datadeps\n");
2159       }
2160   return opt_result::success ();
2161 }
2162
2163 /* Look for SLP-only access groups and turn each individual access into its own
2164    group.  */
2165 static void
2166 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2167 {
2168   unsigned int i;
2169   struct data_reference *dr;
2170
2171   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2172
2173   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2174   FOR_EACH_VEC_ELT (datarefs, i, dr)
2175     {
2176       gcc_assert (DR_REF (dr));
2177       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2178
2179       /* Check if the load is a part of an interleaving chain.  */
2180       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2181         {
2182           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2183           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2184           unsigned int group_size = DR_GROUP_SIZE (first_element);
2185
2186           /* Check if SLP-only groups.  */
2187           if (!STMT_SLP_TYPE (stmt_info)
2188               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2189             {
2190               /* Dissolve the group.  */
2191               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2192
2193               stmt_vec_info vinfo = first_element;
2194               while (vinfo)
2195                 {
2196                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2197                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2198                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2199                   DR_GROUP_SIZE (vinfo) = 1;
2200                   if (STMT_VINFO_STRIDED_P (first_element))
2201                     DR_GROUP_GAP (vinfo) = 0;
2202                   else
2203                     DR_GROUP_GAP (vinfo) = group_size - 1;
2204                   /* Duplicate and adjust alignment info, it needs to
2205                      be present on each group leader, see dr_misalignment.  */
2206                   if (vinfo != first_element)
2207                     {
2208                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2209                       dr_info2->target_alignment = dr_info->target_alignment;
2210                       int misalignment = dr_info->misalignment;
2211                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2212                         {
2213                           HOST_WIDE_INT diff
2214                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2215                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2216                           unsigned HOST_WIDE_INT align_c
2217                             = dr_info->target_alignment.to_constant ();
2218                           misalignment = (misalignment + diff) % align_c;
2219                         }
2220                       dr_info2->misalignment = misalignment;
2221                     }
2222                   vinfo = next;
2223                 }
2224             }
2225         }
2226     }
2227 }
2228
2229 /* Determine if operating on full vectors for LOOP_VINFO might leave
2230    some scalar iterations still to do.  If so, decide how we should
2231    handle those scalar iterations.  The possibilities are:
2232
2233    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2234        In this case:
2235
2236          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2237          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2238          LOOP_VINFO_PEELING_FOR_NITER == false
2239
2240    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2241        to handle the remaining scalar iterations.  In this case:
2242
2243          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2244          LOOP_VINFO_PEELING_FOR_NITER == true
2245
2246        There are two choices:
2247
2248        (2a) Consider vectorizing the epilogue loop at the same VF as the
2249             main loop, but using partial vectors instead of full vectors.
2250             In this case:
2251
2252               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2253
2254        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2255             In this case:
2256
2257               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2258
2259    When FOR_EPILOGUE_P is true, make this determination based on the
2260    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2261    based on the assumption that LOOP_VINFO is the main loop.  The caller
2262    has made sure that the number of iterations is set appropriately for
2263    this value of FOR_EPILOGUE_P.  */
2264
2265 opt_result
2266 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2267                                             bool for_epilogue_p)
2268 {
2269   /* Determine whether there would be any scalar iterations left over.  */
2270   bool need_peeling_or_partial_vectors_p
2271     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2272
2273   /* Decide whether to vectorize the loop with partial vectors.  */
2274   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2275   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2276   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2277       && need_peeling_or_partial_vectors_p)
2278     {
2279       /* For partial-vector-usage=1, try to push the handling of partial
2280          vectors to the epilogue, with the main loop continuing to operate
2281          on full vectors.
2282
2283          If we are unrolling we also do not want to use partial vectors. This
2284          is to avoid the overhead of generating multiple masks and also to
2285          avoid having to execute entire iterations of FALSE masked instructions
2286          when dealing with one or less full iterations.
2287
2288          ??? We could then end up failing to use partial vectors if we
2289          decide to peel iterations into a prologue, and if the main loop
2290          then ends up processing fewer than VF iterations.  */
2291       if ((param_vect_partial_vector_usage == 1
2292            || loop_vinfo->suggested_unroll_factor > 1)
2293           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2294           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2295         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2296       else
2297         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2298     }
2299
2300   if (dump_enabled_p ())
2301     {
2302       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2303         dump_printf_loc (MSG_NOTE, vect_location,
2304                          "operating on partial vectors%s.\n",
2305                          for_epilogue_p ? " for epilogue loop" : "");
2306       else
2307         dump_printf_loc (MSG_NOTE, vect_location,
2308                          "operating only on full vectors%s.\n",
2309                          for_epilogue_p ? " for epilogue loop" : "");
2310     }
2311
2312   if (for_epilogue_p)
2313     {
2314       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2315       gcc_assert (orig_loop_vinfo);
2316       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2317         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2318                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2319     }
2320
2321   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2322       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2323     {
2324       /* Check that the loop processes at least one full vector.  */
2325       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2326       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2327       if (known_lt (wi::to_widest (scalar_niters), vf))
2328         return opt_result::failure_at (vect_location,
2329                                        "loop does not have enough iterations"
2330                                        " to support vectorization.\n");
2331
2332       /* If we need to peel an extra epilogue iteration to handle data
2333          accesses with gaps, check that there are enough scalar iterations
2334          available.
2335
2336          The check above is redundant with this one when peeling for gaps,
2337          but the distinction is useful for diagnostics.  */
2338       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2339       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2340           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2341         return opt_result::failure_at (vect_location,
2342                                        "loop does not have enough iterations"
2343                                        " to support peeling for gaps.\n");
2344     }
2345
2346   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2347     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2348        && need_peeling_or_partial_vectors_p);
2349
2350   return opt_result::success ();
2351 }
2352
2353 /* Function vect_analyze_loop_2.
2354
2355    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2356    analyses will record information in some members of LOOP_VINFO.  FATAL
2357    indicates if some analysis meets fatal error.  If one non-NULL pointer
2358    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2359    worked out suggested unroll factor, while one NULL pointer shows it's
2360    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2361    is to hold the slp decision when the suggested unroll factor is worked
2362    out.  */
2363 static opt_result
2364 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2365                      unsigned *suggested_unroll_factor,
2366                      bool& slp_done_for_suggested_uf)
2367 {
2368   opt_result ok = opt_result::success ();
2369   int res;
2370   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2371   poly_uint64 min_vf = 2;
2372   loop_vec_info orig_loop_vinfo = NULL;
2373
2374   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2375      loop_vec_info of the first vectorized loop.  */
2376   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2377     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2378   else
2379     orig_loop_vinfo = loop_vinfo;
2380   gcc_assert (orig_loop_vinfo);
2381
2382   /* The first group of checks is independent of the vector size.  */
2383   fatal = true;
2384
2385   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2386       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2387     return opt_result::failure_at (vect_location,
2388                                    "not vectorized: simd if(0)\n");
2389
2390   /* Find all data references in the loop (which correspond to vdefs/vuses)
2391      and analyze their evolution in the loop.  */
2392
2393   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2394
2395   /* Gather the data references and count stmts in the loop.  */
2396   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2397     {
2398       opt_result res
2399         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2400                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2401                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2402       if (!res)
2403         {
2404           if (dump_enabled_p ())
2405             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406                              "not vectorized: loop contains function "
2407                              "calls or data references that cannot "
2408                              "be analyzed\n");
2409           return res;
2410         }
2411       loop_vinfo->shared->save_datarefs ();
2412     }
2413   else
2414     loop_vinfo->shared->check_datarefs ();
2415
2416   /* Analyze the data references and also adjust the minimal
2417      vectorization factor according to the loads and stores.  */
2418
2419   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2420   if (!ok)
2421     {
2422       if (dump_enabled_p ())
2423         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2424                          "bad data references.\n");
2425       return ok;
2426     }
2427
2428   /* Check if we are applying unroll factor now.  */
2429   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2430   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2431
2432   /* If the slp decision is false when suggested unroll factor is worked
2433      out, and we are applying suggested unroll factor, we can simply skip
2434      all slp related analyses this time.  */
2435   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2436
2437   /* Classify all cross-iteration scalar data-flow cycles.
2438      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2439   vect_analyze_scalar_cycles (loop_vinfo, slp);
2440
2441   vect_pattern_recog (loop_vinfo);
2442
2443   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2444
2445   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2446      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2447
2448   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2449   if (!ok)
2450     {
2451       if (dump_enabled_p ())
2452         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453                          "bad data access.\n");
2454       return ok;
2455     }
2456
2457   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2458
2459   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2460   if (!ok)
2461     {
2462       if (dump_enabled_p ())
2463         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464                          "unexpected pattern.\n");
2465       return ok;
2466     }
2467
2468   /* While the rest of the analysis below depends on it in some way.  */
2469   fatal = false;
2470
2471   /* Analyze data dependences between the data-refs in the loop
2472      and adjust the maximum vectorization factor according to
2473      the dependences.
2474      FORNOW: fail at the first data dependence that we encounter.  */
2475
2476   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2477   if (!ok)
2478     {
2479       if (dump_enabled_p ())
2480         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2481                          "bad data dependence.\n");
2482       return ok;
2483     }
2484   if (max_vf != MAX_VECTORIZATION_FACTOR
2485       && maybe_lt (max_vf, min_vf))
2486     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2487   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2488
2489   ok = vect_determine_vectorization_factor (loop_vinfo);
2490   if (!ok)
2491     {
2492       if (dump_enabled_p ())
2493         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494                          "can't determine vectorization factor.\n");
2495       return ok;
2496     }
2497   if (max_vf != MAX_VECTORIZATION_FACTOR
2498       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2499     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2500
2501   /* Compute the scalar iteration cost.  */
2502   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2503
2504   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2505
2506   if (slp)
2507     {
2508       /* Check the SLP opportunities in the loop, analyze and build
2509          SLP trees.  */
2510       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2511       if (!ok)
2512         return ok;
2513
2514       /* If there are any SLP instances mark them as pure_slp.  */
2515       slp = vect_make_slp_decision (loop_vinfo);
2516       if (slp)
2517         {
2518           /* Find stmts that need to be both vectorized and SLPed.  */
2519           vect_detect_hybrid_slp (loop_vinfo);
2520
2521           /* Update the vectorization factor based on the SLP decision.  */
2522           vect_update_vf_for_slp (loop_vinfo);
2523
2524           /* Optimize the SLP graph with the vectorization factor fixed.  */
2525           vect_optimize_slp (loop_vinfo);
2526
2527           /* Gather the loads reachable from the SLP graph entries.  */
2528           vect_gather_slp_loads (loop_vinfo);
2529         }
2530     }
2531
2532   bool saved_can_use_partial_vectors_p
2533     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2534
2535   /* We don't expect to have to roll back to anything other than an empty
2536      set of rgroups.  */
2537   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2538
2539   /* This is the point where we can re-start analysis with SLP forced off.  */
2540 start_over:
2541
2542   /* Apply the suggested unrolling factor, this was determined by the backend
2543      during finish_cost the first time we ran the analyzis for this
2544      vector mode.  */
2545   if (applying_suggested_uf)
2546     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2547
2548   /* Now the vectorization factor is final.  */
2549   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2550   gcc_assert (known_ne (vectorization_factor, 0U));
2551
2552   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2553     {
2554       dump_printf_loc (MSG_NOTE, vect_location,
2555                        "vectorization_factor = ");
2556       dump_dec (MSG_NOTE, vectorization_factor);
2557       dump_printf (MSG_NOTE, ", niters = %wd\n",
2558                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2559     }
2560
2561   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2562
2563   /* Analyze the alignment of the data-refs in the loop.
2564      Fail if a data reference is found that cannot be vectorized.  */
2565
2566   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2567   if (!ok)
2568     {
2569       if (dump_enabled_p ())
2570         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571                          "bad data alignment.\n");
2572       return ok;
2573     }
2574
2575   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2576      It is important to call pruning after vect_analyze_data_ref_accesses,
2577      since we use grouping information gathered by interleaving analysis.  */
2578   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2579   if (!ok)
2580     return ok;
2581
2582   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2583      vectorization, since we do not want to add extra peeling or
2584      add versioning for alignment.  */
2585   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2586     /* This pass will decide on using loop versioning and/or loop peeling in
2587        order to enhance the alignment of data references in the loop.  */
2588     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2589   if (!ok)
2590     return ok;
2591
2592   if (slp)
2593     {
2594       /* Analyze operations in the SLP instances.  Note this may
2595          remove unsupported SLP instances which makes the above
2596          SLP kind detection invalid.  */
2597       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2598       vect_slp_analyze_operations (loop_vinfo);
2599       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2600         {
2601           ok = opt_result::failure_at (vect_location,
2602                                        "unsupported SLP instances\n");
2603           goto again;
2604         }
2605
2606       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2607       slp_tree load_node, slp_root;
2608       unsigned i, x;
2609       slp_instance instance;
2610       bool can_use_lanes = true;
2611       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2612         {
2613           slp_root = SLP_INSTANCE_TREE (instance);
2614           int group_size = SLP_TREE_LANES (slp_root);
2615           tree vectype = SLP_TREE_VECTYPE (slp_root);
2616           bool loads_permuted = false;
2617           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2618             {
2619               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2620                 continue;
2621               unsigned j;
2622               stmt_vec_info load_info;
2623               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2624                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2625                   {
2626                     loads_permuted = true;
2627                     break;
2628                   }
2629             }
2630
2631           /* If the loads and stores can be handled with load/store-lane
2632              instructions record it and move on to the next instance.  */
2633           if (loads_permuted
2634               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2635               && vect_store_lanes_supported (vectype, group_size, false))
2636             {
2637               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2638                 {
2639                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2640                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2641                   /* Use SLP for strided accesses (or if we can't
2642                      load-lanes).  */
2643                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2644                       || ! vect_load_lanes_supported
2645                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2646                              DR_GROUP_SIZE (stmt_vinfo), false))
2647                     break;
2648                 }
2649
2650               can_use_lanes
2651                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2652
2653               if (can_use_lanes && dump_enabled_p ())
2654                 dump_printf_loc (MSG_NOTE, vect_location,
2655                                  "SLP instance %p can use load/store-lanes\n",
2656                                  (void *) instance);
2657             }
2658           else
2659             {
2660               can_use_lanes = false;
2661               break;
2662             }
2663         }
2664
2665       /* If all SLP instances can use load/store-lanes abort SLP and try again
2666          with SLP disabled.  */
2667       if (can_use_lanes)
2668         {
2669           ok = opt_result::failure_at (vect_location,
2670                                        "Built SLP cancelled: can use "
2671                                        "load/store-lanes\n");
2672           if (dump_enabled_p ())
2673             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2674                              "Built SLP cancelled: all SLP instances support "
2675                              "load/store-lanes\n");
2676           goto again;
2677         }
2678     }
2679
2680   /* Dissolve SLP-only groups.  */
2681   vect_dissolve_slp_only_groups (loop_vinfo);
2682
2683   /* Scan all the remaining operations in the loop that are not subject
2684      to SLP and make sure they are vectorizable.  */
2685   ok = vect_analyze_loop_operations (loop_vinfo);
2686   if (!ok)
2687     {
2688       if (dump_enabled_p ())
2689         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2690                          "bad operation or unsupported loop bound.\n");
2691       return ok;
2692     }
2693
2694   /* For now, we don't expect to mix both masking and length approaches for one
2695      loop, disable it if both are recorded.  */
2696   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2697       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2698       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2699     {
2700       if (dump_enabled_p ())
2701         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702                          "can't vectorize a loop with partial vectors"
2703                          " because we don't expect to mix different"
2704                          " approaches with partial vectors for the"
2705                          " same loop.\n");
2706       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2707     }
2708
2709   /* If we still have the option of using partial vectors,
2710      check whether we can generate the necessary loop controls.  */
2711   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2712       && !vect_verify_full_masking (loop_vinfo)
2713       && !vect_verify_loop_lens (loop_vinfo))
2714     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2715
2716   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2717      to be able to handle fewer than VF scalars, or needs to have a lower VF
2718      than the main loop.  */
2719   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2720       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2721       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2722                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2723     return opt_result::failure_at (vect_location,
2724                                    "Vectorization factor too high for"
2725                                    " epilogue loop.\n");
2726
2727   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2728      assuming that the loop will be used as a main loop.  We will redo
2729      this analysis later if we instead decide to use the loop as an
2730      epilogue loop.  */
2731   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2732   if (!ok)
2733     return ok;
2734
2735   /* Check the costings of the loop make vectorizing worthwhile.  */
2736   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2737   if (res < 0)
2738     {
2739       ok = opt_result::failure_at (vect_location,
2740                                    "Loop costings may not be worthwhile.\n");
2741       goto again;
2742     }
2743   if (!res)
2744     return opt_result::failure_at (vect_location,
2745                                    "Loop costings not worthwhile.\n");
2746
2747   /* If an epilogue loop is required make sure we can create one.  */
2748   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2749       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2750     {
2751       if (dump_enabled_p ())
2752         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2753       if (!vect_can_advance_ivs_p (loop_vinfo)
2754           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2755                                            single_exit (LOOP_VINFO_LOOP
2756                                                          (loop_vinfo))))
2757         {
2758           ok = opt_result::failure_at (vect_location,
2759                                        "not vectorized: can't create required "
2760                                        "epilog loop\n");
2761           goto again;
2762         }
2763     }
2764
2765   /* During peeling, we need to check if number of loop iterations is
2766      enough for both peeled prolog loop and vector loop.  This check
2767      can be merged along with threshold check of loop versioning, so
2768      increase threshold for this case if necessary.
2769
2770      If we are analyzing an epilogue we still want to check what its
2771      versioning threshold would be.  If we decide to vectorize the epilogues we
2772      will want to use the lowest versioning threshold of all epilogues and main
2773      loop.  This will enable us to enter a vectorized epilogue even when
2774      versioning the loop.  We can't simply check whether the epilogue requires
2775      versioning though since we may have skipped some versioning checks when
2776      analyzing the epilogue.  For instance, checks for alias versioning will be
2777      skipped when dealing with epilogues as we assume we already checked them
2778      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2779   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2780     {
2781       poly_uint64 niters_th = 0;
2782       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2783
2784       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2785         {
2786           /* Niters for peeled prolog loop.  */
2787           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2788             {
2789               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2790               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2791               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2792             }
2793           else
2794             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2795         }
2796
2797       /* Niters for at least one iteration of vectorized loop.  */
2798       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2799         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2800       /* One additional iteration because of peeling for gap.  */
2801       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2802         niters_th += 1;
2803
2804       /*  Use the same condition as vect_transform_loop to decide when to use
2805           the cost to determine a versioning threshold.  */
2806       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2807           && ordered_p (th, niters_th))
2808         niters_th = ordered_max (poly_uint64 (th), niters_th);
2809
2810       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2811     }
2812
2813   gcc_assert (known_eq (vectorization_factor,
2814                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2815
2816   slp_done_for_suggested_uf = slp;
2817
2818   /* Ok to vectorize!  */
2819   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2820   return opt_result::success ();
2821
2822 again:
2823   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2824   gcc_assert (!ok);
2825
2826   /* Try again with SLP forced off but if we didn't do any SLP there is
2827      no point in re-trying.  */
2828   if (!slp)
2829     return ok;
2830
2831   /* If the slp decision is true when suggested unroll factor is worked
2832      out, and we are applying suggested unroll factor, we don't need to
2833      re-try any more.  */
2834   if (applying_suggested_uf && slp_done_for_suggested_uf)
2835     return ok;
2836
2837   /* If there are reduction chains re-trying will fail anyway.  */
2838   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2839     return ok;
2840
2841   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2842      via interleaving or lane instructions.  */
2843   slp_instance instance;
2844   slp_tree node;
2845   unsigned i, j;
2846   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2847     {
2848       stmt_vec_info vinfo;
2849       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2850       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2851         continue;
2852       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2853       unsigned int size = DR_GROUP_SIZE (vinfo);
2854       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2855       if (! vect_store_lanes_supported (vectype, size, false)
2856          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2857          && ! vect_grouped_store_supported (vectype, size))
2858         return opt_result::failure_at (vinfo->stmt,
2859                                        "unsupported grouped store\n");
2860       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2861         {
2862           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2863           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2864           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2865           size = DR_GROUP_SIZE (vinfo);
2866           vectype = STMT_VINFO_VECTYPE (vinfo);
2867           if (! vect_load_lanes_supported (vectype, size, false)
2868               && ! vect_grouped_load_supported (vectype, single_element_p,
2869                                                 size))
2870             return opt_result::failure_at (vinfo->stmt,
2871                                            "unsupported grouped load\n");
2872         }
2873     }
2874
2875   if (dump_enabled_p ())
2876     dump_printf_loc (MSG_NOTE, vect_location,
2877                      "re-trying with SLP disabled\n");
2878
2879   /* Roll back state appropriately.  No SLP this time.  */
2880   slp = false;
2881   /* Restore vectorization factor as it were without SLP.  */
2882   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2883   /* Free the SLP instances.  */
2884   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2885     vect_free_slp_instance (instance);
2886   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2887   /* Reset SLP type to loop_vect on all stmts.  */
2888   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2889     {
2890       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2891       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2892            !gsi_end_p (si); gsi_next (&si))
2893         {
2894           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2895           STMT_SLP_TYPE (stmt_info) = loop_vect;
2896           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2897               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2898             {
2899               /* vectorizable_reduction adjusts reduction stmt def-types,
2900                  restore them to that of the PHI.  */
2901               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2902                 = STMT_VINFO_DEF_TYPE (stmt_info);
2903               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2904                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
2905                 = STMT_VINFO_DEF_TYPE (stmt_info);
2906             }
2907         }
2908       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2909            !gsi_end_p (si); gsi_next (&si))
2910         {
2911           if (is_gimple_debug (gsi_stmt (si)))
2912             continue;
2913           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2914           STMT_SLP_TYPE (stmt_info) = loop_vect;
2915           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2916             {
2917               stmt_vec_info pattern_stmt_info
2918                 = STMT_VINFO_RELATED_STMT (stmt_info);
2919               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2920                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2921
2922               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2923               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2924               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2925                    !gsi_end_p (pi); gsi_next (&pi))
2926                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2927                   = loop_vect;
2928             }
2929         }
2930     }
2931   /* Free optimized alias test DDRS.  */
2932   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2933   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2934   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2935   /* Reset target cost data.  */
2936   delete loop_vinfo->vector_costs;
2937   loop_vinfo->vector_costs = nullptr;
2938   /* Reset accumulated rgroup information.  */
2939   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2940   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2941   /* Reset assorted flags.  */
2942   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2943   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2944   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2945   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2946   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2947     = saved_can_use_partial_vectors_p;
2948
2949   goto start_over;
2950 }
2951
2952 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2953    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2954    OLD_LOOP_VINFO is better unless something specifically indicates
2955    otherwise.
2956
2957    Note that this deliberately isn't a partial order.  */
2958
2959 static bool
2960 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2961                           loop_vec_info old_loop_vinfo)
2962 {
2963   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2964   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2965
2966   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2967   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2968
2969   /* Always prefer a VF of loop->simdlen over any other VF.  */
2970   if (loop->simdlen)
2971     {
2972       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2973       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2974       if (new_simdlen_p != old_simdlen_p)
2975         return new_simdlen_p;
2976     }
2977
2978   const auto *old_costs = old_loop_vinfo->vector_costs;
2979   const auto *new_costs = new_loop_vinfo->vector_costs;
2980   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2981     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2982
2983   return new_costs->better_main_loop_than_p (old_costs);
2984 }
2985
2986 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2987    true if we should.  */
2988
2989 static bool
2990 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2991                         loop_vec_info old_loop_vinfo)
2992 {
2993   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2994     return false;
2995
2996   if (dump_enabled_p ())
2997     dump_printf_loc (MSG_NOTE, vect_location,
2998                      "***** Preferring vector mode %s to vector mode %s\n",
2999                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3000                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3001   return true;
3002 }
3003
3004 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3005    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3006    MODE_I to the next mode useful to analyze.
3007    Return the loop_vinfo on success and wrapped null on failure.  */
3008
3009 static opt_loop_vec_info
3010 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3011                      const vect_loop_form_info *loop_form_info,
3012                      loop_vec_info main_loop_vinfo,
3013                      const vector_modes &vector_modes, unsigned &mode_i,
3014                      machine_mode &autodetected_vector_mode,
3015                      bool &fatal)
3016 {
3017   loop_vec_info loop_vinfo
3018     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3019
3020   machine_mode vector_mode = vector_modes[mode_i];
3021   loop_vinfo->vector_mode = vector_mode;
3022   unsigned int suggested_unroll_factor = 1;
3023   bool slp_done_for_suggested_uf;
3024
3025   /* Run the main analysis.  */
3026   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3027                                         &suggested_unroll_factor,
3028                                         slp_done_for_suggested_uf);
3029   if (dump_enabled_p ())
3030     dump_printf_loc (MSG_NOTE, vect_location,
3031                      "***** Analysis %s with vector mode %s\n",
3032                      res ? "succeeded" : " failed",
3033                      GET_MODE_NAME (loop_vinfo->vector_mode));
3034
3035   if (!main_loop_vinfo && suggested_unroll_factor > 1)
3036     {
3037       if (dump_enabled_p ())
3038         dump_printf_loc (MSG_NOTE, vect_location,
3039                          "***** Re-trying analysis for unrolling"
3040                          " with unroll factor %d and slp %s.\n",
3041                          suggested_unroll_factor,
3042                          slp_done_for_suggested_uf ? "on" : "off");
3043       loop_vec_info unroll_vinfo
3044         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3045       unroll_vinfo->vector_mode = vector_mode;
3046       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3047       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3048                                                 slp_done_for_suggested_uf);
3049       if (new_res)
3050         {
3051           delete loop_vinfo;
3052           loop_vinfo = unroll_vinfo;
3053         }
3054       else
3055         delete unroll_vinfo;
3056     }
3057
3058   /* Remember the autodetected vector mode.  */
3059   if (vector_mode == VOIDmode)
3060     autodetected_vector_mode = loop_vinfo->vector_mode;
3061
3062   /* Advance mode_i, first skipping modes that would result in the
3063      same analysis result.  */
3064   while (mode_i + 1 < vector_modes.length ()
3065          && vect_chooses_same_modes_p (loop_vinfo,
3066                                        vector_modes[mode_i + 1]))
3067     {
3068       if (dump_enabled_p ())
3069         dump_printf_loc (MSG_NOTE, vect_location,
3070                          "***** The result for vector mode %s would"
3071                          " be the same\n",
3072                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3073       mode_i += 1;
3074     }
3075   if (mode_i + 1 < vector_modes.length ()
3076       && VECTOR_MODE_P (autodetected_vector_mode)
3077       && (related_vector_mode (vector_modes[mode_i + 1],
3078                                GET_MODE_INNER (autodetected_vector_mode))
3079           == autodetected_vector_mode)
3080       && (related_vector_mode (autodetected_vector_mode,
3081                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3082           == vector_modes[mode_i + 1]))
3083     {
3084       if (dump_enabled_p ())
3085         dump_printf_loc (MSG_NOTE, vect_location,
3086                          "***** Skipping vector mode %s, which would"
3087                          " repeat the analysis for %s\n",
3088                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3089                          GET_MODE_NAME (autodetected_vector_mode));
3090       mode_i += 1;
3091     }
3092   mode_i++;
3093
3094   if (!res)
3095     {
3096       delete loop_vinfo;
3097       if (fatal)
3098         gcc_checking_assert (main_loop_vinfo == NULL);
3099       return opt_loop_vec_info::propagate_failure (res);
3100     }
3101
3102   return opt_loop_vec_info::success (loop_vinfo);
3103 }
3104
3105 /* Function vect_analyze_loop.
3106
3107    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3108    for it.  The different analyses will record information in the
3109    loop_vec_info struct.  */
3110 opt_loop_vec_info
3111 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3112 {
3113   DUMP_VECT_SCOPE ("analyze_loop_nest");
3114
3115   if (loop_outer (loop)
3116       && loop_vec_info_for_loop (loop_outer (loop))
3117       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3118     return opt_loop_vec_info::failure_at (vect_location,
3119                                           "outer-loop already vectorized.\n");
3120
3121   if (!find_loop_nest (loop, &shared->loop_nest))
3122     return opt_loop_vec_info::failure_at
3123       (vect_location,
3124        "not vectorized: loop nest containing two or more consecutive inner"
3125        " loops cannot be vectorized\n");
3126
3127   /* Analyze the loop form.  */
3128   vect_loop_form_info loop_form_info;
3129   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3130   if (!res)
3131     {
3132       if (dump_enabled_p ())
3133         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3134                          "bad loop form.\n");
3135       return opt_loop_vec_info::propagate_failure (res);
3136     }
3137   if (!integer_onep (loop_form_info.assumptions))
3138     {
3139       /* We consider to vectorize this loop by versioning it under
3140          some assumptions.  In order to do this, we need to clear
3141          existing information computed by scev and niter analyzer.  */
3142       scev_reset_htab ();
3143       free_numbers_of_iterations_estimates (loop);
3144       /* Also set flag for this loop so that following scev and niter
3145          analysis are done under the assumptions.  */
3146       loop_constraint_set (loop, LOOP_C_FINITE);
3147     }
3148
3149   auto_vector_modes vector_modes;
3150   /* Autodetect first vector size we try.  */
3151   vector_modes.safe_push (VOIDmode);
3152   unsigned int autovec_flags
3153     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3154                                                     loop->simdlen != 0);
3155   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3156                              && !unlimited_cost_model (loop));
3157   machine_mode autodetected_vector_mode = VOIDmode;
3158   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3159   unsigned int mode_i = 0;
3160   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3161
3162   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3163      a mode has not been analyzed.  */
3164   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3165   for (unsigned i = 0; i < vector_modes.length (); ++i)
3166     cached_vf_per_mode.safe_push (0);
3167
3168   /* First determine the main loop vectorization mode, either the first
3169      one that works, starting with auto-detecting the vector mode and then
3170      following the targets order of preference, or the one with the
3171      lowest cost if pick_lowest_cost_p.  */
3172   while (1)
3173     {
3174       bool fatal;
3175       unsigned int last_mode_i = mode_i;
3176       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3177          failed.  */
3178       cached_vf_per_mode[last_mode_i] = -1;
3179       opt_loop_vec_info loop_vinfo
3180         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3181                                NULL, vector_modes, mode_i,
3182                                autodetected_vector_mode, fatal);
3183       if (fatal)
3184         break;
3185
3186       if (loop_vinfo)
3187         {
3188           /*  Analyzis has been successful so update the VF value.  The
3189               VF should always be a multiple of unroll_factor and we want to
3190               capture the original VF here.  */
3191           cached_vf_per_mode[last_mode_i]
3192             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3193                          loop_vinfo->suggested_unroll_factor);
3194           /* Once we hit the desired simdlen for the first time,
3195              discard any previous attempts.  */
3196           if (simdlen
3197               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3198             {
3199               delete first_loop_vinfo;
3200               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3201               simdlen = 0;
3202             }
3203           else if (pick_lowest_cost_p
3204                    && first_loop_vinfo
3205                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3206             {
3207               /* Pick loop_vinfo over first_loop_vinfo.  */
3208               delete first_loop_vinfo;
3209               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3210             }
3211           if (first_loop_vinfo == NULL)
3212             first_loop_vinfo = loop_vinfo;
3213           else
3214             {
3215               delete loop_vinfo;
3216               loop_vinfo = opt_loop_vec_info::success (NULL);
3217             }
3218
3219           /* Commit to first_loop_vinfo if we have no reason to try
3220              alternatives.  */
3221           if (!simdlen && !pick_lowest_cost_p)
3222             break;
3223         }
3224       if (mode_i == vector_modes.length ()
3225           || autodetected_vector_mode == VOIDmode)
3226         break;
3227
3228       /* Try the next biggest vector size.  */
3229       if (dump_enabled_p ())
3230         dump_printf_loc (MSG_NOTE, vect_location,
3231                          "***** Re-trying analysis with vector mode %s\n",
3232                          GET_MODE_NAME (vector_modes[mode_i]));
3233     }
3234   if (!first_loop_vinfo)
3235     return opt_loop_vec_info::propagate_failure (res);
3236
3237   if (dump_enabled_p ())
3238     dump_printf_loc (MSG_NOTE, vect_location,
3239                      "***** Choosing vector mode %s\n",
3240                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3241
3242   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3243      enabled, SIMDUID is not set, it is the innermost loop and we have
3244      either already found the loop's SIMDLEN or there was no SIMDLEN to
3245      begin with.
3246      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3247   bool vect_epilogues = (!simdlen
3248                          && loop->inner == NULL
3249                          && param_vect_epilogues_nomask
3250                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3251                          && !loop->simduid);
3252   if (!vect_epilogues)
3253     return first_loop_vinfo;
3254
3255   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3256   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3257
3258   /* For epilogues start the analysis from the first mode.  The motivation
3259      behind starting from the beginning comes from cases where the VECTOR_MODES
3260      array may contain length-agnostic and length-specific modes.  Their
3261      ordering is not guaranteed, so we could end up picking a mode for the main
3262      loop that is after the epilogue's optimal mode.  */
3263   vector_modes[0] = autodetected_vector_mode;
3264   mode_i = 0;
3265
3266   bool supports_partial_vectors =
3267     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3268   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3269
3270   while (1)
3271     {
3272       /* If the target does not support partial vectors we can shorten the
3273          number of modes to analyze for the epilogue as we know we can't pick a
3274          mode that would lead to a VF at least as big as the
3275          FIRST_VINFO_VF.  */
3276       if (!supports_partial_vectors
3277           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3278         {
3279           mode_i++;
3280           if (mode_i == vector_modes.length ())
3281             break;
3282           continue;
3283         }
3284
3285       if (dump_enabled_p ())
3286         dump_printf_loc (MSG_NOTE, vect_location,
3287                          "***** Re-trying epilogue analysis with vector "
3288                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3289
3290       bool fatal;
3291       opt_loop_vec_info loop_vinfo
3292         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3293                                first_loop_vinfo,
3294                                vector_modes, mode_i,
3295                                autodetected_vector_mode, fatal);
3296       if (fatal)
3297         break;
3298
3299       if (loop_vinfo)
3300         {
3301           if (pick_lowest_cost_p)
3302             {
3303               /* Keep trying to roll back vectorization attempts while the
3304                  loop_vec_infos they produced were worse than this one.  */
3305               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3306               while (!vinfos.is_empty ()
3307                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3308                 {
3309                   gcc_assert (vect_epilogues);
3310                   delete vinfos.pop ();
3311                 }
3312             }
3313           /* For now only allow one epilogue loop.  */
3314           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3315             {
3316               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3317               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3318               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3319                           || maybe_ne (lowest_th, 0U));
3320               /* Keep track of the known smallest versioning
3321                  threshold.  */
3322               if (ordered_p (lowest_th, th))
3323                 lowest_th = ordered_min (lowest_th, th);
3324             }
3325           else
3326             {
3327               delete loop_vinfo;
3328               loop_vinfo = opt_loop_vec_info::success (NULL);
3329             }
3330
3331           /* For now only allow one epilogue loop, but allow
3332              pick_lowest_cost_p to replace it, so commit to the
3333              first epilogue if we have no reason to try alternatives.  */
3334           if (!pick_lowest_cost_p)
3335             break;
3336         }
3337
3338       if (mode_i == vector_modes.length ())
3339         break;
3340
3341     }
3342
3343   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3344     {
3345       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3346       if (dump_enabled_p ())
3347         dump_printf_loc (MSG_NOTE, vect_location,
3348                          "***** Choosing epilogue vector mode %s\n",
3349                          GET_MODE_NAME
3350                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3351     }
3352
3353   return first_loop_vinfo;
3354 }
3355
3356 /* Return true if there is an in-order reduction function for CODE, storing
3357    it in *REDUC_FN if so.  */
3358
3359 static bool
3360 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3361 {
3362   if (code == PLUS_EXPR)
3363     {
3364       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3365       return true;
3366     }
3367   return false;
3368 }
3369
3370 /* Function reduction_fn_for_scalar_code
3371
3372    Input:
3373    CODE - tree_code of a reduction operations.
3374
3375    Output:
3376    REDUC_FN - the corresponding internal function to be used to reduce the
3377       vector of partial results into a single scalar result, or IFN_LAST
3378       if the operation is a supported reduction operation, but does not have
3379       such an internal function.
3380
3381    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3382
3383 bool
3384 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3385 {
3386   if (code.is_tree_code ())
3387     switch (tree_code (code))
3388       {
3389       case MAX_EXPR:
3390         *reduc_fn = IFN_REDUC_MAX;
3391         return true;
3392
3393       case MIN_EXPR:
3394         *reduc_fn = IFN_REDUC_MIN;
3395         return true;
3396
3397       case PLUS_EXPR:
3398         *reduc_fn = IFN_REDUC_PLUS;
3399         return true;
3400
3401       case BIT_AND_EXPR:
3402         *reduc_fn = IFN_REDUC_AND;
3403         return true;
3404
3405       case BIT_IOR_EXPR:
3406         *reduc_fn = IFN_REDUC_IOR;
3407         return true;
3408
3409       case BIT_XOR_EXPR:
3410         *reduc_fn = IFN_REDUC_XOR;
3411         return true;
3412
3413       case MULT_EXPR:
3414       case MINUS_EXPR:
3415         *reduc_fn = IFN_LAST;
3416         return true;
3417
3418       default:
3419         return false;
3420       }
3421   else
3422     switch (combined_fn (code))
3423       {
3424       CASE_CFN_FMAX:
3425         *reduc_fn = IFN_REDUC_FMAX;
3426         return true;
3427
3428       CASE_CFN_FMIN:
3429         *reduc_fn = IFN_REDUC_FMIN;
3430         return true;
3431
3432       default:
3433         return false;
3434       }
3435 }
3436
3437 /* If there is a neutral value X such that a reduction would not be affected
3438    by the introduction of additional X elements, return that X, otherwise
3439    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3440    of the scalar elements.  If the reduction has just a single initial value
3441    then INITIAL_VALUE is that value, otherwise it is null.  */
3442
3443 tree
3444 neutral_op_for_reduction (tree scalar_type, code_helper code,
3445                           tree initial_value)
3446 {
3447   if (code.is_tree_code ())
3448     switch (tree_code (code))
3449       {
3450       case WIDEN_SUM_EXPR:
3451       case DOT_PROD_EXPR:
3452       case SAD_EXPR:
3453       case PLUS_EXPR:
3454       case MINUS_EXPR:
3455       case BIT_IOR_EXPR:
3456       case BIT_XOR_EXPR:
3457         return build_zero_cst (scalar_type);
3458
3459       case MULT_EXPR:
3460         return build_one_cst (scalar_type);
3461
3462       case BIT_AND_EXPR:
3463         return build_all_ones_cst (scalar_type);
3464
3465       case MAX_EXPR:
3466       case MIN_EXPR:
3467         return initial_value;
3468
3469       default:
3470         return NULL_TREE;
3471       }
3472   else
3473     switch (combined_fn (code))
3474       {
3475       CASE_CFN_FMIN:
3476       CASE_CFN_FMAX:
3477         return initial_value;
3478
3479       default:
3480         return NULL_TREE;
3481       }
3482 }
3483
3484 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3485    STMT is printed with a message MSG. */
3486
3487 static void
3488 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3489 {
3490   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3491 }
3492
3493 /* Return true if we need an in-order reduction for operation CODE
3494    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3495    overflow must wrap.  */
3496
3497 bool
3498 needs_fold_left_reduction_p (tree type, code_helper code)
3499 {
3500   /* CHECKME: check for !flag_finite_math_only too?  */
3501   if (SCALAR_FLOAT_TYPE_P (type))
3502     {
3503       if (code.is_tree_code ())
3504         switch (tree_code (code))
3505           {
3506           case MIN_EXPR:
3507           case MAX_EXPR:
3508             return false;
3509
3510           default:
3511             return !flag_associative_math;
3512           }
3513       else
3514         switch (combined_fn (code))
3515           {
3516           CASE_CFN_FMIN:
3517           CASE_CFN_FMAX:
3518             return false;
3519
3520           default:
3521             return !flag_associative_math;
3522           }
3523     }
3524
3525   if (INTEGRAL_TYPE_P (type))
3526     return (!code.is_tree_code ()
3527             || !operation_no_trapping_overflow (type, tree_code (code)));
3528
3529   if (SAT_FIXED_POINT_TYPE_P (type))
3530     return true;
3531
3532   return false;
3533 }
3534
3535 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3536    has a handled computation expression.  Store the main reduction
3537    operation in *CODE.  */
3538
3539 static bool
3540 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3541                       tree loop_arg, code_helper *code,
3542                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3543 {
3544   auto_bitmap visited;
3545   tree lookfor = PHI_RESULT (phi);
3546   ssa_op_iter curri;
3547   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3548   while (USE_FROM_PTR (curr) != loop_arg)
3549     curr = op_iter_next_use (&curri);
3550   curri.i = curri.numops;
3551   do
3552     {
3553       path.safe_push (std::make_pair (curri, curr));
3554       tree use = USE_FROM_PTR (curr);
3555       if (use == lookfor)
3556         break;
3557       gimple *def = SSA_NAME_DEF_STMT (use);
3558       if (gimple_nop_p (def)
3559           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3560         {
3561 pop:
3562           do
3563             {
3564               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3565               curri = x.first;
3566               curr = x.second;
3567               do
3568                 curr = op_iter_next_use (&curri);
3569               /* Skip already visited or non-SSA operands (from iterating
3570                  over PHI args).  */
3571               while (curr != NULL_USE_OPERAND_P
3572                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3573                          || ! bitmap_set_bit (visited,
3574                                               SSA_NAME_VERSION
3575                                                 (USE_FROM_PTR (curr)))));
3576             }
3577           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3578           if (curr == NULL_USE_OPERAND_P)
3579             break;
3580         }
3581       else
3582         {
3583           if (gimple_code (def) == GIMPLE_PHI)
3584             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3585           else
3586             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3587           while (curr != NULL_USE_OPERAND_P
3588                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3589                      || ! bitmap_set_bit (visited,
3590                                           SSA_NAME_VERSION
3591                                             (USE_FROM_PTR (curr)))))
3592             curr = op_iter_next_use (&curri);
3593           if (curr == NULL_USE_OPERAND_P)
3594             goto pop;
3595         }
3596     }
3597   while (1);
3598   if (dump_file && (dump_flags & TDF_DETAILS))
3599     {
3600       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3601       unsigned i;
3602       std::pair<ssa_op_iter, use_operand_p> *x;
3603       FOR_EACH_VEC_ELT (path, i, x)
3604         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3605       dump_printf (MSG_NOTE, "\n");
3606     }
3607
3608   /* Check whether the reduction path detected is valid.  */
3609   bool fail = path.length () == 0;
3610   bool neg = false;
3611   int sign = -1;
3612   *code = ERROR_MARK;
3613   for (unsigned i = 1; i < path.length (); ++i)
3614     {
3615       gimple *use_stmt = USE_STMT (path[i].second);
3616       gimple_match_op op;
3617       if (!gimple_extract_op (use_stmt, &op))
3618         {
3619           fail = true;
3620           break;
3621         }
3622       unsigned int opi = op.num_ops;
3623       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3624         {
3625           /* The following make sure we can compute the operand index
3626              easily plus it mostly disallows chaining via COND_EXPR condition
3627              operands.  */
3628           for (opi = 0; opi < op.num_ops; ++opi)
3629             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3630               break;
3631         }
3632       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3633         {
3634           for (opi = 0; opi < op.num_ops; ++opi)
3635             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3636               break;
3637         }
3638       if (opi == op.num_ops)
3639         {
3640           fail = true;
3641           break;
3642         }
3643       op.code = canonicalize_code (op.code, op.type);
3644       if (op.code == MINUS_EXPR)
3645         {
3646           op.code = PLUS_EXPR;
3647           /* Track whether we negate the reduction value each iteration.  */
3648           if (op.ops[1] == op.ops[opi])
3649             neg = ! neg;
3650         }
3651       if (CONVERT_EXPR_CODE_P (op.code)
3652           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3653         ;
3654       else if (*code == ERROR_MARK)
3655         {
3656           *code = op.code;
3657           sign = TYPE_SIGN (op.type);
3658         }
3659       else if (op.code != *code)
3660         {
3661           fail = true;
3662           break;
3663         }
3664       else if ((op.code == MIN_EXPR
3665                 || op.code == MAX_EXPR)
3666                && sign != TYPE_SIGN (op.type))
3667         {
3668           fail = true;
3669           break;
3670         }
3671       /* Check there's only a single stmt the op is used on.  For the
3672          not value-changing tail and the last stmt allow out-of-loop uses.
3673          ???  We could relax this and handle arbitrary live stmts by
3674          forcing a scalar epilogue for example.  */
3675       imm_use_iterator imm_iter;
3676       gimple *op_use_stmt;
3677       unsigned cnt = 0;
3678       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3679         if (!is_gimple_debug (op_use_stmt)
3680             && (*code != ERROR_MARK
3681                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3682           {
3683             /* We want to allow x + x but not x < 1 ? x : 2.  */
3684             if (is_gimple_assign (op_use_stmt)
3685                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3686               {
3687                 use_operand_p use_p;
3688                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3689                   cnt++;
3690               }
3691             else
3692               cnt++;
3693           }
3694       if (cnt != 1)
3695         {
3696           fail = true;
3697           break;
3698         }
3699     }
3700   return ! fail && ! neg && *code != ERROR_MARK;
3701 }
3702
3703 bool
3704 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3705                       tree loop_arg, enum tree_code code)
3706 {
3707   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3708   code_helper code_;
3709   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3710           && code_ == code);
3711 }
3712
3713
3714
3715 /* Function vect_is_simple_reduction
3716
3717    (1) Detect a cross-iteration def-use cycle that represents a simple
3718    reduction computation.  We look for the following pattern:
3719
3720    loop_header:
3721      a1 = phi < a0, a2 >
3722      a3 = ...
3723      a2 = operation (a3, a1)
3724
3725    or
3726
3727    a3 = ...
3728    loop_header:
3729      a1 = phi < a0, a2 >
3730      a2 = operation (a3, a1)
3731
3732    such that:
3733    1. operation is commutative and associative and it is safe to
3734       change the order of the computation
3735    2. no uses for a2 in the loop (a2 is used out of the loop)
3736    3. no uses of a1 in the loop besides the reduction operation
3737    4. no uses of a1 outside the loop.
3738
3739    Conditions 1,4 are tested here.
3740    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3741
3742    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3743    nested cycles.
3744
3745    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3746    reductions:
3747
3748      a1 = phi < a0, a2 >
3749      inner loop (def of a3)
3750      a2 = phi < a3 >
3751
3752    (4) Detect condition expressions, ie:
3753      for (int i = 0; i < N; i++)
3754        if (a[i] < val)
3755         ret_val = a[i];
3756
3757 */
3758
3759 static stmt_vec_info
3760 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3761                           bool *double_reduc, bool *reduc_chain_p, bool slp)
3762 {
3763   gphi *phi = as_a <gphi *> (phi_info->stmt);
3764   gimple *phi_use_stmt = NULL;
3765   imm_use_iterator imm_iter;
3766   use_operand_p use_p;
3767
3768   *double_reduc = false;
3769   *reduc_chain_p = false;
3770   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3771
3772   tree phi_name = PHI_RESULT (phi);
3773   /* ???  If there are no uses of the PHI result the inner loop reduction
3774      won't be detected as possibly double-reduction by vectorizable_reduction
3775      because that tries to walk the PHI arg from the preheader edge which
3776      can be constant.  See PR60382.  */
3777   if (has_zero_uses (phi_name))
3778     return NULL;
3779   class loop *loop = (gimple_bb (phi))->loop_father;
3780   unsigned nphi_def_loop_uses = 0;
3781   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3782     {
3783       gimple *use_stmt = USE_STMT (use_p);
3784       if (is_gimple_debug (use_stmt))
3785         continue;
3786
3787       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3788         {
3789           if (dump_enabled_p ())
3790             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3791                              "intermediate value used outside loop.\n");
3792
3793           return NULL;
3794         }
3795
3796       nphi_def_loop_uses++;
3797       phi_use_stmt = use_stmt;
3798     }
3799
3800   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3801   if (TREE_CODE (latch_def) != SSA_NAME)
3802     {
3803       if (dump_enabled_p ())
3804         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3805                          "reduction: not ssa_name: %T\n", latch_def);
3806       return NULL;
3807     }
3808
3809   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3810   if (!def_stmt_info
3811       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3812     return NULL;
3813
3814   bool nested_in_vect_loop
3815     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3816   unsigned nlatch_def_loop_uses = 0;
3817   auto_vec<gphi *, 3> lcphis;
3818   bool inner_loop_of_double_reduc = false;
3819   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3820     {
3821       gimple *use_stmt = USE_STMT (use_p);
3822       if (is_gimple_debug (use_stmt))
3823         continue;
3824       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3825         nlatch_def_loop_uses++;
3826       else
3827         {
3828           /* We can have more than one loop-closed PHI.  */
3829           lcphis.safe_push (as_a <gphi *> (use_stmt));
3830           if (nested_in_vect_loop
3831               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3832                   == vect_double_reduction_def))
3833             inner_loop_of_double_reduc = true;
3834         }
3835     }
3836
3837   /* If we are vectorizing an inner reduction we are executing that
3838      in the original order only in case we are not dealing with a
3839      double reduction.  */
3840   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3841     {
3842       if (dump_enabled_p ())
3843         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3844                         "detected nested cycle: ");
3845       return def_stmt_info;
3846     }
3847
3848   /* When the inner loop of a double reduction ends up with more than
3849      one loop-closed PHI we have failed to classify alternate such
3850      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3851   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3852     {
3853       if (dump_enabled_p ())
3854         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3855                          "unhandle double reduction\n");
3856       return NULL;
3857     }
3858
3859   /* If this isn't a nested cycle or if the nested cycle reduction value
3860      is used ouside of the inner loop we cannot handle uses of the reduction
3861      value.  */
3862   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3863     {
3864       if (dump_enabled_p ())
3865         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3866                          "reduction used in loop.\n");
3867       return NULL;
3868     }
3869
3870   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3871      defined in the inner loop.  */
3872   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3873     {
3874       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3875       if (gimple_phi_num_args (def_stmt) != 1
3876           || TREE_CODE (op1) != SSA_NAME)
3877         {
3878           if (dump_enabled_p ())
3879             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3880                              "unsupported phi node definition.\n");
3881
3882           return NULL;
3883         }
3884
3885       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3886       if (gimple_bb (def1)
3887           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3888           && loop->inner
3889           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3890           && (is_gimple_assign (def1) || is_gimple_call (def1))
3891           && is_a <gphi *> (phi_use_stmt)
3892           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3893         {
3894           if (dump_enabled_p ())
3895             report_vect_op (MSG_NOTE, def_stmt,
3896                             "detected double reduction: ");
3897
3898           *double_reduc = true;
3899           return def_stmt_info;
3900         }
3901
3902       return NULL;
3903     }
3904
3905   /* Look for the expression computing latch_def from then loop PHI result.  */
3906   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3907   code_helper code;
3908   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3909                             path))
3910     {
3911       STMT_VINFO_REDUC_CODE (phi_info) = code;
3912       if (code == COND_EXPR && !nested_in_vect_loop)
3913         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3914
3915       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3916          reduction chain for which the additional restriction is that
3917          all operations in the chain are the same.  */
3918       auto_vec<stmt_vec_info, 8> reduc_chain;
3919       unsigned i;
3920       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3921       for (i = path.length () - 1; i >= 1; --i)
3922         {
3923           gimple *stmt = USE_STMT (path[i].second);
3924           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3925           gimple_match_op op;
3926           if (!gimple_extract_op (stmt, &op))
3927             gcc_unreachable ();
3928           if (gassign *assign = dyn_cast<gassign *> (stmt))
3929             STMT_VINFO_REDUC_IDX (stmt_info)
3930               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3931           else
3932             {
3933               gcall *call = as_a<gcall *> (stmt);
3934               STMT_VINFO_REDUC_IDX (stmt_info)
3935                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3936             }
3937           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3938                                      && (i == 1 || i == path.length () - 1));
3939           if ((op.code != code && !leading_conversion)
3940               /* We can only handle the final value in epilogue
3941                  generation for reduction chains.  */
3942               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3943             is_slp_reduc = false;
3944           /* For reduction chains we support a trailing/leading
3945              conversions.  We do not store those in the actual chain.  */
3946           if (leading_conversion)
3947             continue;
3948           reduc_chain.safe_push (stmt_info);
3949         }
3950       if (slp && is_slp_reduc && reduc_chain.length () > 1)
3951         {
3952           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3953             {
3954               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3955               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3956             }
3957           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3958           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3959
3960           /* Save the chain for further analysis in SLP detection.  */
3961           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3962           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3963
3964           *reduc_chain_p = true;
3965           if (dump_enabled_p ())
3966             dump_printf_loc (MSG_NOTE, vect_location,
3967                             "reduction: detected reduction chain\n");
3968         }
3969       else if (dump_enabled_p ())
3970         dump_printf_loc (MSG_NOTE, vect_location,
3971                          "reduction: detected reduction\n");
3972
3973       return def_stmt_info;
3974     }
3975
3976   if (dump_enabled_p ())
3977     dump_printf_loc (MSG_NOTE, vect_location,
3978                      "reduction: unknown pattern\n");
3979
3980   return NULL;
3981 }
3982
3983 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3984    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3985    or -1 if not known.  */
3986
3987 static int
3988 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3989 {
3990   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3991   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3992     {
3993       if (dump_enabled_p ())
3994         dump_printf_loc (MSG_NOTE, vect_location,
3995                          "cost model: epilogue peel iters set to vf/2 "
3996                          "because loop iterations are unknown .\n");
3997       return assumed_vf / 2;
3998     }
3999   else
4000     {
4001       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4002       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4003       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4004       /* If we need to peel for gaps, but no peeling is required, we have to
4005          peel VF iterations.  */
4006       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4007         peel_iters_epilogue = assumed_vf;
4008       return peel_iters_epilogue;
4009     }
4010 }
4011
4012 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4013 int
4014 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4015                              int *peel_iters_epilogue,
4016                              stmt_vector_for_cost *scalar_cost_vec,
4017                              stmt_vector_for_cost *prologue_cost_vec,
4018                              stmt_vector_for_cost *epilogue_cost_vec)
4019 {
4020   int retval = 0;
4021
4022   *peel_iters_epilogue
4023     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4024
4025   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4026     {
4027       /* If peeled iterations are known but number of scalar loop
4028          iterations are unknown, count a taken branch per peeled loop.  */
4029       if (peel_iters_prologue > 0)
4030         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4031                                    vect_prologue);
4032       if (*peel_iters_epilogue > 0)
4033         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4034                                     vect_epilogue);
4035     }
4036
4037   stmt_info_for_cost *si;
4038   int j;
4039   if (peel_iters_prologue)
4040     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4041       retval += record_stmt_cost (prologue_cost_vec,
4042                                   si->count * peel_iters_prologue,
4043                                   si->kind, si->stmt_info, si->misalign,
4044                                   vect_prologue);
4045   if (*peel_iters_epilogue)
4046     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4047       retval += record_stmt_cost (epilogue_cost_vec,
4048                                   si->count * *peel_iters_epilogue,
4049                                   si->kind, si->stmt_info, si->misalign,
4050                                   vect_epilogue);
4051
4052   return retval;
4053 }
4054
4055 /* Function vect_estimate_min_profitable_iters
4056
4057    Return the number of iterations required for the vector version of the
4058    loop to be profitable relative to the cost of the scalar version of the
4059    loop.
4060
4061    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4062    of iterations for vectorization.  -1 value means loop vectorization
4063    is not profitable.  This returned value may be used for dynamic
4064    profitability check.
4065
4066    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4067    for static check against estimated number of iterations.  */
4068
4069 static void
4070 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4071                                     int *ret_min_profitable_niters,
4072                                     int *ret_min_profitable_estimate,
4073                                     unsigned *suggested_unroll_factor)
4074 {
4075   int min_profitable_iters;
4076   int min_profitable_estimate;
4077   int peel_iters_prologue;
4078   int peel_iters_epilogue;
4079   unsigned vec_inside_cost = 0;
4080   int vec_outside_cost = 0;
4081   unsigned vec_prologue_cost = 0;
4082   unsigned vec_epilogue_cost = 0;
4083   int scalar_single_iter_cost = 0;
4084   int scalar_outside_cost = 0;
4085   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4086   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4087   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4088
4089   /* Cost model disabled.  */
4090   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4091     {
4092       if (dump_enabled_p ())
4093         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4094       *ret_min_profitable_niters = 0;
4095       *ret_min_profitable_estimate = 0;
4096       return;
4097     }
4098
4099   /* Requires loop versioning tests to handle misalignment.  */
4100   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4101     {
4102       /*  FIXME: Make cost depend on complexity of individual check.  */
4103       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4104       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4105       if (dump_enabled_p ())
4106         dump_printf (MSG_NOTE,
4107                      "cost model: Adding cost of checks for loop "
4108                      "versioning to treat misalignment.\n");
4109     }
4110
4111   /* Requires loop versioning with alias checks.  */
4112   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4113     {
4114       /*  FIXME: Make cost depend on complexity of individual check.  */
4115       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4116       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4117       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4118       if (len)
4119         /* Count LEN - 1 ANDs and LEN comparisons.  */
4120         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4121                               scalar_stmt, vect_prologue);
4122       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4123       if (len)
4124         {
4125           /* Count LEN - 1 ANDs and LEN comparisons.  */
4126           unsigned int nstmts = len * 2 - 1;
4127           /* +1 for each bias that needs adding.  */
4128           for (unsigned int i = 0; i < len; ++i)
4129             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4130               nstmts += 1;
4131           (void) add_stmt_cost (target_cost_data, nstmts,
4132                                 scalar_stmt, vect_prologue);
4133         }
4134       if (dump_enabled_p ())
4135         dump_printf (MSG_NOTE,
4136                      "cost model: Adding cost of checks for loop "
4137                      "versioning aliasing.\n");
4138     }
4139
4140   /* Requires loop versioning with niter checks.  */
4141   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4142     {
4143       /*  FIXME: Make cost depend on complexity of individual check.  */
4144       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4145                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4146       if (dump_enabled_p ())
4147         dump_printf (MSG_NOTE,
4148                      "cost model: Adding cost of checks for loop "
4149                      "versioning niters.\n");
4150     }
4151
4152   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4153     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4154                           vect_prologue);
4155
4156   /* Count statements in scalar loop.  Using this as scalar cost for a single
4157      iteration for now.
4158
4159      TODO: Add outer loop support.
4160
4161      TODO: Consider assigning different costs to different scalar
4162      statements.  */
4163
4164   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4165
4166   /* Add additional cost for the peeled instructions in prologue and epilogue
4167      loop.  (For fully-masked loops there will be no peeling.)
4168
4169      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4170      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4171
4172      TODO: Build an expression that represents peel_iters for prologue and
4173      epilogue to be used in a run-time test.  */
4174
4175   bool prologue_need_br_taken_cost = false;
4176   bool prologue_need_br_not_taken_cost = false;
4177
4178   /* Calculate peel_iters_prologue.  */
4179   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4180     peel_iters_prologue = 0;
4181   else if (npeel < 0)
4182     {
4183       peel_iters_prologue = assumed_vf / 2;
4184       if (dump_enabled_p ())
4185         dump_printf (MSG_NOTE, "cost model: "
4186                      "prologue peel iters set to vf/2.\n");
4187
4188       /* If peeled iterations are unknown, count a taken branch and a not taken
4189          branch per peeled loop.  Even if scalar loop iterations are known,
4190          vector iterations are not known since peeled prologue iterations are
4191          not known.  Hence guards remain the same.  */
4192       prologue_need_br_taken_cost = true;
4193       prologue_need_br_not_taken_cost = true;
4194     }
4195   else
4196     {
4197       peel_iters_prologue = npeel;
4198       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4199         /* If peeled iterations are known but number of scalar loop
4200            iterations are unknown, count a taken branch per peeled loop.  */
4201         prologue_need_br_taken_cost = true;
4202     }
4203
4204   bool epilogue_need_br_taken_cost = false;
4205   bool epilogue_need_br_not_taken_cost = false;
4206
4207   /* Calculate peel_iters_epilogue.  */
4208   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4209     /* We need to peel exactly one iteration for gaps.  */
4210     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4211   else if (npeel < 0)
4212     {
4213       /* If peeling for alignment is unknown, loop bound of main loop
4214          becomes unknown.  */
4215       peel_iters_epilogue = assumed_vf / 2;
4216       if (dump_enabled_p ())
4217         dump_printf (MSG_NOTE, "cost model: "
4218                      "epilogue peel iters set to vf/2 because "
4219                      "peeling for alignment is unknown.\n");
4220
4221       /* See the same reason above in peel_iters_prologue calculation.  */
4222       epilogue_need_br_taken_cost = true;
4223       epilogue_need_br_not_taken_cost = true;
4224     }
4225   else
4226     {
4227       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4228       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4229         /* If peeled iterations are known but number of scalar loop
4230            iterations are unknown, count a taken branch per peeled loop.  */
4231         epilogue_need_br_taken_cost = true;
4232     }
4233
4234   stmt_info_for_cost *si;
4235   int j;
4236   /* Add costs associated with peel_iters_prologue.  */
4237   if (peel_iters_prologue)
4238     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4239       {
4240         (void) add_stmt_cost (target_cost_data,
4241                               si->count * peel_iters_prologue, si->kind,
4242                               si->stmt_info, si->node, si->vectype,
4243                               si->misalign, vect_prologue);
4244       }
4245
4246   /* Add costs associated with peel_iters_epilogue.  */
4247   if (peel_iters_epilogue)
4248     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4249       {
4250         (void) add_stmt_cost (target_cost_data,
4251                               si->count * peel_iters_epilogue, si->kind,
4252                               si->stmt_info, si->node, si->vectype,
4253                               si->misalign, vect_epilogue);
4254       }
4255
4256   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4257
4258   if (prologue_need_br_taken_cost)
4259     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4260                           vect_prologue);
4261
4262   if (prologue_need_br_not_taken_cost)
4263     (void) add_stmt_cost (target_cost_data, 1,
4264                           cond_branch_not_taken, vect_prologue);
4265
4266   if (epilogue_need_br_taken_cost)
4267     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4268                           vect_epilogue);
4269
4270   if (epilogue_need_br_not_taken_cost)
4271     (void) add_stmt_cost (target_cost_data, 1,
4272                           cond_branch_not_taken, vect_epilogue);
4273
4274   /* Take care of special costs for rgroup controls of partial vectors.  */
4275   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4276     {
4277       /* Calculate how many masks we need to generate.  */
4278       unsigned int num_masks = 0;
4279       rgroup_controls *rgm;
4280       unsigned int num_vectors_m1;
4281       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4282         if (rgm->type)
4283           num_masks += num_vectors_m1 + 1;
4284       gcc_assert (num_masks > 0);
4285
4286       /* In the worst case, we need to generate each mask in the prologue
4287          and in the loop body.  One of the loop body mask instructions
4288          replaces the comparison in the scalar loop, and since we don't
4289          count the scalar comparison against the scalar body, we shouldn't
4290          count that vector instruction against the vector body either.
4291
4292          Sometimes we can use unpacks instead of generating prologue
4293          masks and sometimes the prologue mask will fold to a constant,
4294          so the actual prologue cost might be smaller.  However, it's
4295          simpler and safer to use the worst-case cost; if this ends up
4296          being the tie-breaker between vectorizing or not, then it's
4297          probably better not to vectorize.  */
4298       (void) add_stmt_cost (target_cost_data, num_masks,
4299                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4300                             vect_prologue);
4301       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4302                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4303                             vect_body);
4304     }
4305   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4306     {
4307       /* Referring to the functions vect_set_loop_condition_partial_vectors
4308          and vect_set_loop_controls_directly, we need to generate each
4309          length in the prologue and in the loop body if required. Although
4310          there are some possible optimizations, we consider the worst case
4311          here.  */
4312
4313       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4314       signed char partial_load_store_bias
4315         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4316       bool need_iterate_p
4317         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4318            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4319
4320       /* Calculate how many statements to be added.  */
4321       unsigned int prologue_stmts = 0;
4322       unsigned int body_stmts = 0;
4323
4324       rgroup_controls *rgc;
4325       unsigned int num_vectors_m1;
4326       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4327         if (rgc->type)
4328           {
4329             /* May need one SHIFT for nitems_total computation.  */
4330             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4331             if (nitems != 1 && !niters_known_p)
4332               prologue_stmts += 1;
4333
4334             /* May need one MAX and one MINUS for wrap around.  */
4335             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4336               prologue_stmts += 2;
4337
4338             /* Need one MAX and one MINUS for each batch limit excepting for
4339                the 1st one.  */
4340             prologue_stmts += num_vectors_m1 * 2;
4341
4342             unsigned int num_vectors = num_vectors_m1 + 1;
4343
4344             /* Need to set up lengths in prologue, only one MIN required
4345                for each since start index is zero.  */
4346             prologue_stmts += num_vectors;
4347
4348             /* If we have a non-zero partial load bias, we need one PLUS
4349                to adjust the load length.  */
4350             if (partial_load_store_bias != 0)
4351               body_stmts += 1;
4352
4353             /* Each may need two MINs and one MINUS to update lengths in body
4354                for next iteration.  */
4355             if (need_iterate_p)
4356               body_stmts += 3 * num_vectors;
4357           }
4358
4359       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4360                             scalar_stmt, vect_prologue);
4361       (void) add_stmt_cost (target_cost_data, body_stmts,
4362                             scalar_stmt, vect_body);
4363     }
4364
4365   /* FORNOW: The scalar outside cost is incremented in one of the
4366      following ways:
4367
4368      1. The vectorizer checks for alignment and aliasing and generates
4369      a condition that allows dynamic vectorization.  A cost model
4370      check is ANDED with the versioning condition.  Hence scalar code
4371      path now has the added cost of the versioning check.
4372
4373        if (cost > th & versioning_check)
4374          jmp to vector code
4375
4376      Hence run-time scalar is incremented by not-taken branch cost.
4377
4378      2. The vectorizer then checks if a prologue is required.  If the
4379      cost model check was not done before during versioning, it has to
4380      be done before the prologue check.
4381
4382        if (cost <= th)
4383          prologue = scalar_iters
4384        if (prologue == 0)
4385          jmp to vector code
4386        else
4387          execute prologue
4388        if (prologue == num_iters)
4389          go to exit
4390
4391      Hence the run-time scalar cost is incremented by a taken branch,
4392      plus a not-taken branch, plus a taken branch cost.
4393
4394      3. The vectorizer then checks if an epilogue is required.  If the
4395      cost model check was not done before during prologue check, it
4396      has to be done with the epilogue check.
4397
4398        if (prologue == 0)
4399          jmp to vector code
4400        else
4401          execute prologue
4402        if (prologue == num_iters)
4403          go to exit
4404        vector code:
4405          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4406            jmp to epilogue
4407
4408      Hence the run-time scalar cost should be incremented by 2 taken
4409      branches.
4410
4411      TODO: The back end may reorder the BBS's differently and reverse
4412      conditions/branch directions.  Change the estimates below to
4413      something more reasonable.  */
4414
4415   /* If the number of iterations is known and we do not do versioning, we can
4416      decide whether to vectorize at compile time.  Hence the scalar version
4417      do not carry cost model guard costs.  */
4418   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4419       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4420     {
4421       /* Cost model check occurs at versioning.  */
4422       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4423         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4424       else
4425         {
4426           /* Cost model check occurs at prologue generation.  */
4427           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4428             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4429               + vect_get_stmt_cost (cond_branch_not_taken);
4430           /* Cost model check occurs at epilogue generation.  */
4431           else
4432             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4433         }
4434     }
4435
4436   /* Complete the target-specific cost calculations.  */
4437   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4438                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4439                suggested_unroll_factor);
4440
4441   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4442       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4443       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4444                     *suggested_unroll_factor,
4445                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4446     {
4447       if (dump_enabled_p ())
4448         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4449                          "can't unroll as unrolled vectorization factor larger"
4450                          " than maximum vectorization factor: "
4451                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4452                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4453       *suggested_unroll_factor = 1;
4454     }
4455
4456   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4457
4458   if (dump_enabled_p ())
4459     {
4460       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4461       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4462                    vec_inside_cost);
4463       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4464                    vec_prologue_cost);
4465       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4466                    vec_epilogue_cost);
4467       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4468                    scalar_single_iter_cost);
4469       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4470                    scalar_outside_cost);
4471       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4472                    vec_outside_cost);
4473       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4474                    peel_iters_prologue);
4475       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4476                    peel_iters_epilogue);
4477     }
4478
4479   /* Calculate number of iterations required to make the vector version
4480      profitable, relative to the loop bodies only.  The following condition
4481      must hold true:
4482      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4483      where
4484      SIC = scalar iteration cost, VIC = vector iteration cost,
4485      VOC = vector outside cost, VF = vectorization factor,
4486      NPEEL = prologue iterations + epilogue iterations,
4487      SOC = scalar outside cost for run time cost model check.  */
4488
4489   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4490                           - vec_inside_cost);
4491   if (saving_per_viter <= 0)
4492     {
4493       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4494         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4495                     "vectorization did not happen for a simd loop");
4496
4497       if (dump_enabled_p ())
4498         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4499                          "cost model: the vector iteration cost = %d "
4500                          "divided by the scalar iteration cost = %d "
4501                          "is greater or equal to the vectorization factor = %d"
4502                          ".\n",
4503                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4504       *ret_min_profitable_niters = -1;
4505       *ret_min_profitable_estimate = -1;
4506       return;
4507     }
4508
4509   /* ??? The "if" arm is written to handle all cases; see below for what
4510      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4511   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4512     {
4513       /* Rewriting the condition above in terms of the number of
4514          vector iterations (vniters) rather than the number of
4515          scalar iterations (niters) gives:
4516
4517          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4518
4519          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4520
4521          For integer N, X and Y when X > 0:
4522
4523          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4524       int outside_overhead = (vec_outside_cost
4525                               - scalar_single_iter_cost * peel_iters_prologue
4526                               - scalar_single_iter_cost * peel_iters_epilogue
4527                               - scalar_outside_cost);
4528       /* We're only interested in cases that require at least one
4529          vector iteration.  */
4530       int min_vec_niters = 1;
4531       if (outside_overhead > 0)
4532         min_vec_niters = outside_overhead / saving_per_viter + 1;
4533
4534       if (dump_enabled_p ())
4535         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4536                      min_vec_niters);
4537
4538       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4539         {
4540           /* Now that we know the minimum number of vector iterations,
4541              find the minimum niters for which the scalar cost is larger:
4542
4543              SIC * niters > VIC * vniters + VOC - SOC
4544
4545              We know that the minimum niters is no more than
4546              vniters * VF + NPEEL, but it might be (and often is) less
4547              than that if a partial vector iteration is cheaper than the
4548              equivalent scalar code.  */
4549           int threshold = (vec_inside_cost * min_vec_niters
4550                            + vec_outside_cost
4551                            - scalar_outside_cost);
4552           if (threshold <= 0)
4553             min_profitable_iters = 1;
4554           else
4555             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4556         }
4557       else
4558         /* Convert the number of vector iterations into a number of
4559            scalar iterations.  */
4560         min_profitable_iters = (min_vec_niters * assumed_vf
4561                                 + peel_iters_prologue
4562                                 + peel_iters_epilogue);
4563     }
4564   else
4565     {
4566       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4567                               * assumed_vf
4568                               - vec_inside_cost * peel_iters_prologue
4569                               - vec_inside_cost * peel_iters_epilogue);
4570       if (min_profitable_iters <= 0)
4571         min_profitable_iters = 0;
4572       else
4573         {
4574           min_profitable_iters /= saving_per_viter;
4575
4576           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4577               <= (((int) vec_inside_cost * min_profitable_iters)
4578                   + (((int) vec_outside_cost - scalar_outside_cost)
4579                      * assumed_vf)))
4580             min_profitable_iters++;
4581         }
4582     }
4583
4584   if (dump_enabled_p ())
4585     dump_printf (MSG_NOTE,
4586                  "  Calculated minimum iters for profitability: %d\n",
4587                  min_profitable_iters);
4588
4589   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4590       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4591     /* We want the vectorized loop to execute at least once.  */
4592     min_profitable_iters = assumed_vf + peel_iters_prologue;
4593   else if (min_profitable_iters < peel_iters_prologue)
4594     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4595        vectorized loop executes at least once.  */
4596     min_profitable_iters = peel_iters_prologue;
4597
4598   if (dump_enabled_p ())
4599     dump_printf_loc (MSG_NOTE, vect_location,
4600                      "  Runtime profitability threshold = %d\n",
4601                      min_profitable_iters);
4602
4603   *ret_min_profitable_niters = min_profitable_iters;
4604
4605   /* Calculate number of iterations required to make the vector version
4606      profitable, relative to the loop bodies only.
4607
4608      Non-vectorized variant is SIC * niters and it must win over vector
4609      variant on the expected loop trip count.  The following condition must hold true:
4610      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4611
4612   if (vec_outside_cost <= 0)
4613     min_profitable_estimate = 0;
4614   /* ??? This "else if" arm is written to handle all cases; see below for
4615      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4616   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4617     {
4618       /* This is a repeat of the code above, but with + SOC rather
4619          than - SOC.  */
4620       int outside_overhead = (vec_outside_cost
4621                               - scalar_single_iter_cost * peel_iters_prologue
4622                               - scalar_single_iter_cost * peel_iters_epilogue
4623                               + scalar_outside_cost);
4624       int min_vec_niters = 1;
4625       if (outside_overhead > 0)
4626         min_vec_niters = outside_overhead / saving_per_viter + 1;
4627
4628       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4629         {
4630           int threshold = (vec_inside_cost * min_vec_niters
4631                            + vec_outside_cost
4632                            + scalar_outside_cost);
4633           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4634         }
4635       else
4636         min_profitable_estimate = (min_vec_niters * assumed_vf
4637                                    + peel_iters_prologue
4638                                    + peel_iters_epilogue);
4639     }
4640   else
4641     {
4642       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4643                                  * assumed_vf
4644                                  - vec_inside_cost * peel_iters_prologue
4645                                  - vec_inside_cost * peel_iters_epilogue)
4646                                  / ((scalar_single_iter_cost * assumed_vf)
4647                                    - vec_inside_cost);
4648     }
4649   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4650   if (dump_enabled_p ())
4651     dump_printf_loc (MSG_NOTE, vect_location,
4652                      "  Static estimate profitability threshold = %d\n",
4653                      min_profitable_estimate);
4654
4655   *ret_min_profitable_estimate = min_profitable_estimate;
4656 }
4657
4658 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4659    vector elements (not bits) for a vector with NELT elements.  */
4660 static void
4661 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4662                               vec_perm_builder *sel)
4663 {
4664   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4665      by vec_perm_indices.  */
4666   sel->new_vector (nelt, 1, 3);
4667   for (unsigned int i = 0; i < 3; i++)
4668     sel->quick_push (i + offset);
4669 }
4670
4671 /* Checks whether the target supports whole-vector shifts for vectors of mode
4672    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4673    it supports vec_perm_const with masks for all necessary shift amounts.  */
4674 static bool
4675 have_whole_vector_shift (machine_mode mode)
4676 {
4677   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4678     return true;
4679
4680   /* Variable-length vectors should be handled via the optab.  */
4681   unsigned int nelt;
4682   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4683     return false;
4684
4685   vec_perm_builder sel;
4686   vec_perm_indices indices;
4687   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4688     {
4689       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4690       indices.new_vector (sel, 2, nelt);
4691       if (!can_vec_perm_const_p (mode, mode, indices, false))
4692         return false;
4693     }
4694   return true;
4695 }
4696
4697 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4698    multiplication operands have differing signs and (b) we intend
4699    to emulate the operation using a series of signed DOT_PROD_EXPRs.
4700    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
4701
4702 static bool
4703 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4704                                  stmt_vec_info stmt_info)
4705 {
4706   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4707   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4708     return false;
4709
4710   tree rhs1 = gimple_assign_rhs1 (assign);
4711   tree rhs2 = gimple_assign_rhs2 (assign);
4712   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4713     return false;
4714
4715   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4716   gcc_assert (reduc_info->is_reduc_info);
4717   return !directly_supported_p (DOT_PROD_EXPR,
4718                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4719                                 optab_vector_mixed_sign);
4720 }
4721
4722 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4723    functions. Design better to avoid maintenance issues.  */
4724
4725 /* Function vect_model_reduction_cost.
4726
4727    Models cost for a reduction operation, including the vector ops
4728    generated within the strip-mine loop in some cases, the initial
4729    definition before the loop, and the epilogue code that must be generated.  */
4730
4731 static void
4732 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4733                            stmt_vec_info stmt_info, internal_fn reduc_fn,
4734                            vect_reduction_type reduction_type,
4735                            int ncopies, stmt_vector_for_cost *cost_vec)
4736 {
4737   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4738   tree vectype;
4739   machine_mode mode;
4740   class loop *loop = NULL;
4741
4742   if (loop_vinfo)
4743     loop = LOOP_VINFO_LOOP (loop_vinfo);
4744
4745   /* Condition reductions generate two reductions in the loop.  */
4746   if (reduction_type == COND_REDUCTION)
4747     ncopies *= 2;
4748
4749   vectype = STMT_VINFO_VECTYPE (stmt_info);
4750   mode = TYPE_MODE (vectype);
4751   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4752
4753   gimple_match_op op;
4754   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4755     gcc_unreachable ();
4756
4757   bool emulated_mixed_dot_prod
4758     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4759   if (reduction_type == EXTRACT_LAST_REDUCTION)
4760     /* No extra instructions are needed in the prologue.  The loop body
4761        operations are costed in vectorizable_condition.  */
4762     inside_cost = 0;
4763   else if (reduction_type == FOLD_LEFT_REDUCTION)
4764     {
4765       /* No extra instructions needed in the prologue.  */
4766       prologue_cost = 0;
4767
4768       if (reduc_fn != IFN_LAST)
4769         /* Count one reduction-like operation per vector.  */
4770         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4771                                         stmt_info, 0, vect_body);
4772       else
4773         {
4774           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4775           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4776           inside_cost = record_stmt_cost (cost_vec, nelements,
4777                                           vec_to_scalar, stmt_info, 0,
4778                                           vect_body);
4779           inside_cost += record_stmt_cost (cost_vec, nelements,
4780                                            scalar_stmt, stmt_info, 0,
4781                                            vect_body);
4782         }
4783     }
4784   else
4785     {
4786       /* Add in the cost of the initial definitions.  */
4787       int prologue_stmts;
4788       if (reduction_type == COND_REDUCTION)
4789         /* For cond reductions we have four vectors: initial index, step,
4790            initial result of the data reduction, initial value of the index
4791            reduction.  */
4792         prologue_stmts = 4;
4793       else if (emulated_mixed_dot_prod)
4794         /* We need the initial reduction value and two invariants:
4795            one that contains the minimum signed value and one that
4796            contains half of its negative.  */
4797         prologue_stmts = 3;
4798       else
4799         prologue_stmts = 1;
4800       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4801                                          scalar_to_vec, stmt_info, 0,
4802                                          vect_prologue);
4803     }
4804
4805   /* Determine cost of epilogue code.
4806
4807      We have a reduction operator that will reduce the vector in one statement.
4808      Also requires scalar extract.  */
4809
4810   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4811     {
4812       if (reduc_fn != IFN_LAST)
4813         {
4814           if (reduction_type == COND_REDUCTION)
4815             {
4816               /* An EQ stmt and an COND_EXPR stmt.  */
4817               epilogue_cost += record_stmt_cost (cost_vec, 2,
4818                                                  vector_stmt, stmt_info, 0,
4819                                                  vect_epilogue);
4820               /* Reduction of the max index and a reduction of the found
4821                  values.  */
4822               epilogue_cost += record_stmt_cost (cost_vec, 2,
4823                                                  vec_to_scalar, stmt_info, 0,
4824                                                  vect_epilogue);
4825               /* A broadcast of the max value.  */
4826               epilogue_cost += record_stmt_cost (cost_vec, 1,
4827                                                  scalar_to_vec, stmt_info, 0,
4828                                                  vect_epilogue);
4829             }
4830           else
4831             {
4832               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4833                                                  stmt_info, 0, vect_epilogue);
4834               epilogue_cost += record_stmt_cost (cost_vec, 1,
4835                                                  vec_to_scalar, stmt_info, 0,
4836                                                  vect_epilogue);
4837             }
4838         }
4839       else if (reduction_type == COND_REDUCTION)
4840         {
4841           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4842           /* Extraction of scalar elements.  */
4843           epilogue_cost += record_stmt_cost (cost_vec,
4844                                              2 * estimated_nunits,
4845                                              vec_to_scalar, stmt_info, 0,
4846                                              vect_epilogue);
4847           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4848           epilogue_cost += record_stmt_cost (cost_vec,
4849                                              2 * estimated_nunits - 3,
4850                                              scalar_stmt, stmt_info, 0,
4851                                              vect_epilogue);
4852         }
4853       else if (reduction_type == EXTRACT_LAST_REDUCTION
4854                || reduction_type == FOLD_LEFT_REDUCTION)
4855         /* No extra instructions need in the epilogue.  */
4856         ;
4857       else
4858         {
4859           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4860           tree bitsize = TYPE_SIZE (op.type);
4861           int element_bitsize = tree_to_uhwi (bitsize);
4862           int nelements = vec_size_in_bits / element_bitsize;
4863
4864           if (op.code == COND_EXPR)
4865             op.code = MAX_EXPR;
4866
4867           /* We have a whole vector shift available.  */
4868           if (VECTOR_MODE_P (mode)
4869               && directly_supported_p (op.code, vectype)
4870               && have_whole_vector_shift (mode))
4871             {
4872               /* Final reduction via vector shifts and the reduction operator.
4873                  Also requires scalar extract.  */
4874               epilogue_cost += record_stmt_cost (cost_vec,
4875                                                  exact_log2 (nelements) * 2,
4876                                                  vector_stmt, stmt_info, 0,
4877                                                  vect_epilogue);
4878               epilogue_cost += record_stmt_cost (cost_vec, 1,
4879                                                  vec_to_scalar, stmt_info, 0,
4880                                                  vect_epilogue);
4881             }
4882           else
4883             /* Use extracts and reduction op for final reduction.  For N
4884                elements, we have N extracts and N-1 reduction ops.  */
4885             epilogue_cost += record_stmt_cost (cost_vec,
4886                                                nelements + nelements - 1,
4887                                                vector_stmt, stmt_info, 0,
4888                                                vect_epilogue);
4889         }
4890     }
4891
4892   if (dump_enabled_p ())
4893     dump_printf (MSG_NOTE,
4894                  "vect_model_reduction_cost: inside_cost = %d, "
4895                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4896                  prologue_cost, epilogue_cost);
4897 }
4898
4899 /* SEQ is a sequence of instructions that initialize the reduction
4900    described by REDUC_INFO.  Emit them in the appropriate place.  */
4901
4902 static void
4903 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4904                                 stmt_vec_info reduc_info, gimple *seq)
4905 {
4906   if (reduc_info->reused_accumulator)
4907     {
4908       /* When reusing an accumulator from the main loop, we only need
4909          initialization instructions if the main loop can be skipped.
4910          In that case, emit the initialization instructions at the end
4911          of the guard block that does the skip.  */
4912       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4913       gcc_assert (skip_edge);
4914       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4915       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4916     }
4917   else
4918     {
4919       /* The normal case: emit the initialization instructions on the
4920          preheader edge.  */
4921       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4922       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4923     }
4924 }
4925
4926 /* Function get_initial_def_for_reduction
4927
4928    Input:
4929    REDUC_INFO - the info_for_reduction
4930    INIT_VAL - the initial value of the reduction variable
4931    NEUTRAL_OP - a value that has no effect on the reduction, as per
4932                 neutral_op_for_reduction
4933
4934    Output:
4935    Return a vector variable, initialized according to the operation that
4936         STMT_VINFO performs. This vector will be used as the initial value
4937         of the vector of partial results.
4938
4939    The value we need is a vector in which element 0 has value INIT_VAL
4940    and every other element has value NEUTRAL_OP.  */
4941
4942 static tree
4943 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4944                                stmt_vec_info reduc_info,
4945                                tree init_val, tree neutral_op)
4946 {
4947   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4948   tree scalar_type = TREE_TYPE (init_val);
4949   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4950   tree init_def;
4951   gimple_seq stmts = NULL;
4952
4953   gcc_assert (vectype);
4954
4955   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4956               || SCALAR_FLOAT_TYPE_P (scalar_type));
4957
4958   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4959               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4960
4961   if (operand_equal_p (init_val, neutral_op))
4962     {
4963       /* If both elements are equal then the vector described above is
4964          just a splat.  */
4965       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4966       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4967     }
4968   else
4969     {
4970       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4971       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4972       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4973         {
4974           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4975              element 0.  */
4976           init_def = gimple_build_vector_from_val (&stmts, vectype,
4977                                                    neutral_op);
4978           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4979                                    vectype, init_def, init_val);
4980         }
4981       else
4982         {
4983           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4984           tree_vector_builder elts (vectype, 1, 2);
4985           elts.quick_push (init_val);
4986           elts.quick_push (neutral_op);
4987           init_def = gimple_build_vector (&stmts, &elts);
4988         }
4989     }
4990
4991   if (stmts)
4992     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4993   return init_def;
4994 }
4995
4996 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4997    which performs a reduction involving GROUP_SIZE scalar statements.
4998    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4999    is nonnull, introducing extra elements of that value will not change the
5000    result.  */
5001
5002 static void
5003 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5004                                 stmt_vec_info reduc_info,
5005                                 vec<tree> *vec_oprnds,
5006                                 unsigned int number_of_vectors,
5007                                 unsigned int group_size, tree neutral_op)
5008 {
5009   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5010   unsigned HOST_WIDE_INT nunits;
5011   unsigned j, number_of_places_left_in_vector;
5012   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5013   unsigned int i;
5014
5015   gcc_assert (group_size == initial_values.length () || neutral_op);
5016
5017   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5018      created vectors. It is greater than 1 if unrolling is performed.
5019
5020      For example, we have two scalar operands, s1 and s2 (e.g., group of
5021      strided accesses of size two), while NUNITS is four (i.e., four scalars
5022      of this type can be packed in a vector).  The output vector will contain
5023      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5024      will be 2).
5025
5026      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5027      vectors containing the operands.
5028
5029      For example, NUNITS is four as before, and the group size is 8
5030      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5031      {s5, s6, s7, s8}.  */
5032
5033   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5034     nunits = group_size;
5035
5036   number_of_places_left_in_vector = nunits;
5037   bool constant_p = true;
5038   tree_vector_builder elts (vector_type, nunits, 1);
5039   elts.quick_grow (nunits);
5040   gimple_seq ctor_seq = NULL;
5041   for (j = 0; j < nunits * number_of_vectors; ++j)
5042     {
5043       tree op;
5044       i = j % group_size;
5045
5046       /* Get the def before the loop.  In reduction chain we have only
5047          one initial value.  Else we have as many as PHIs in the group.  */
5048       if (i >= initial_values.length () || (j > i && neutral_op))
5049         op = neutral_op;
5050       else
5051         op = initial_values[i];
5052
5053       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5054       number_of_places_left_in_vector--;
5055       elts[nunits - number_of_places_left_in_vector - 1] = op;
5056       if (!CONSTANT_CLASS_P (op))
5057         constant_p = false;
5058
5059       if (number_of_places_left_in_vector == 0)
5060         {
5061           tree init;
5062           if (constant_p && !neutral_op
5063               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5064               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5065             /* Build the vector directly from ELTS.  */
5066             init = gimple_build_vector (&ctor_seq, &elts);
5067           else if (neutral_op)
5068             {
5069               /* Build a vector of the neutral value and shift the
5070                  other elements into place.  */
5071               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5072                                                    neutral_op);
5073               int k = nunits;
5074               while (k > 0 && elts[k - 1] == neutral_op)
5075                 k -= 1;
5076               while (k > 0)
5077                 {
5078                   k -= 1;
5079                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5080                                        vector_type, init, elts[k]);
5081                 }
5082             }
5083           else
5084             {
5085               /* First time round, duplicate ELTS to fill the
5086                  required number of vectors.  */
5087               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5088                                         elts, number_of_vectors, *vec_oprnds);
5089               break;
5090             }
5091           vec_oprnds->quick_push (init);
5092
5093           number_of_places_left_in_vector = nunits;
5094           elts.new_vector (vector_type, nunits, 1);
5095           elts.quick_grow (nunits);
5096           constant_p = true;
5097         }
5098     }
5099   if (ctor_seq != NULL)
5100     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5101 }
5102
5103 /* For a statement STMT_INFO taking part in a reduction operation return
5104    the stmt_vec_info the meta information is stored on.  */
5105
5106 stmt_vec_info
5107 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5108 {
5109   stmt_info = vect_orig_stmt (stmt_info);
5110   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5111   if (!is_a <gphi *> (stmt_info->stmt)
5112       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5113     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5114   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5115   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5116     {
5117       if (gimple_phi_num_args (phi) == 1)
5118         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5119     }
5120   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5121     {
5122       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5123       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5124         stmt_info = info;
5125     }
5126   return stmt_info;
5127 }
5128
5129 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5130    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5131    return false.  */
5132
5133 static bool
5134 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5135                                 stmt_vec_info reduc_info)
5136 {
5137   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5138   if (!main_loop_vinfo)
5139     return false;
5140
5141   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5142     return false;
5143
5144   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5145   auto_vec<tree, 16> main_loop_results (num_phis);
5146   auto_vec<tree, 16> initial_values (num_phis);
5147   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5148     {
5149       /* The epilogue loop can be entered either from the main loop or
5150          from an earlier guard block.  */
5151       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5152       for (tree incoming_value : reduc_info->reduc_initial_values)
5153         {
5154           /* Look for:
5155
5156                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5157                                     INITIAL_VALUE(guard block)>.  */
5158           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5159
5160           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5161           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5162
5163           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5164           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5165
5166           main_loop_results.quick_push (from_main_loop);
5167           initial_values.quick_push (from_skip);
5168         }
5169     }
5170   else
5171     /* The main loop dominates the epilogue loop.  */
5172     main_loop_results.splice (reduc_info->reduc_initial_values);
5173
5174   /* See if the main loop has the kind of accumulator we need.  */
5175   vect_reusable_accumulator *accumulator
5176     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5177   if (!accumulator
5178       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5179       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5180                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5181     return false;
5182
5183   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5184   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5185   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5186   unsigned HOST_WIDE_INT m;
5187   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5188                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5189     return false;
5190   /* Check the intermediate vector types and operations are available.  */
5191   tree prev_vectype = old_vectype;
5192   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5193   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5194     {
5195       intermediate_nunits = exact_div (intermediate_nunits, 2);
5196       tree intermediate_vectype = get_related_vectype_for_scalar_type
5197         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5198       if (!intermediate_vectype
5199           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5200                                     intermediate_vectype)
5201           || !can_vec_extract (TYPE_MODE (prev_vectype),
5202                                TYPE_MODE (intermediate_vectype)))
5203         return false;
5204       prev_vectype = intermediate_vectype;
5205     }
5206
5207   /* Non-SLP reductions might apply an adjustment after the reduction
5208      operation, in order to simplify the initialization of the accumulator.
5209      If the epilogue loop carries on from where the main loop left off,
5210      it should apply the same adjustment to the final reduction result.
5211
5212      If the epilogue loop can also be entered directly (rather than via
5213      the main loop), we need to be able to handle that case in the same way,
5214      with the same adjustment.  (In principle we could add a PHI node
5215      to select the correct adjustment, but in practice that shouldn't be
5216      necessary.)  */
5217   tree main_adjustment
5218     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5219   if (loop_vinfo->main_loop_edge && main_adjustment)
5220     {
5221       gcc_assert (num_phis == 1);
5222       tree initial_value = initial_values[0];
5223       /* Check that we can use INITIAL_VALUE as the adjustment and
5224          initialize the accumulator with a neutral value instead.  */
5225       if (!operand_equal_p (initial_value, main_adjustment))
5226         return false;
5227       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5228       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5229                                                     code, initial_value);
5230     }
5231   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5232   reduc_info->reduc_initial_values.truncate (0);
5233   reduc_info->reduc_initial_values.splice (initial_values);
5234   reduc_info->reused_accumulator = accumulator;
5235   return true;
5236 }
5237
5238 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5239    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5240
5241 static tree
5242 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5243                             gimple_seq *seq)
5244 {
5245   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5246   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5247   tree stype = TREE_TYPE (vectype);
5248   tree new_temp = vec_def;
5249   while (nunits > nunits1)
5250     {
5251       nunits /= 2;
5252       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5253                                                            stype, nunits);
5254       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5255
5256       /* The target has to make sure we support lowpart/highpart
5257          extraction, either via direct vector extract or through
5258          an integer mode punning.  */
5259       tree dst1, dst2;
5260       gimple *epilog_stmt;
5261       if (convert_optab_handler (vec_extract_optab,
5262                                  TYPE_MODE (TREE_TYPE (new_temp)),
5263                                  TYPE_MODE (vectype1))
5264           != CODE_FOR_nothing)
5265         {
5266           /* Extract sub-vectors directly once vec_extract becomes
5267              a conversion optab.  */
5268           dst1 = make_ssa_name (vectype1);
5269           epilog_stmt
5270               = gimple_build_assign (dst1, BIT_FIELD_REF,
5271                                      build3 (BIT_FIELD_REF, vectype1,
5272                                              new_temp, TYPE_SIZE (vectype1),
5273                                              bitsize_int (0)));
5274           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5275           dst2 =  make_ssa_name (vectype1);
5276           epilog_stmt
5277               = gimple_build_assign (dst2, BIT_FIELD_REF,
5278                                      build3 (BIT_FIELD_REF, vectype1,
5279                                              new_temp, TYPE_SIZE (vectype1),
5280                                              bitsize_int (bitsize)));
5281           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5282         }
5283       else
5284         {
5285           /* Extract via punning to appropriately sized integer mode
5286              vector.  */
5287           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5288           tree etype = build_vector_type (eltype, 2);
5289           gcc_assert (convert_optab_handler (vec_extract_optab,
5290                                              TYPE_MODE (etype),
5291                                              TYPE_MODE (eltype))
5292                       != CODE_FOR_nothing);
5293           tree tem = make_ssa_name (etype);
5294           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5295                                              build1 (VIEW_CONVERT_EXPR,
5296                                                      etype, new_temp));
5297           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5298           new_temp = tem;
5299           tem = make_ssa_name (eltype);
5300           epilog_stmt
5301               = gimple_build_assign (tem, BIT_FIELD_REF,
5302                                      build3 (BIT_FIELD_REF, eltype,
5303                                              new_temp, TYPE_SIZE (eltype),
5304                                              bitsize_int (0)));
5305           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5306           dst1 = make_ssa_name (vectype1);
5307           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5308                                              build1 (VIEW_CONVERT_EXPR,
5309                                                      vectype1, tem));
5310           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5311           tem = make_ssa_name (eltype);
5312           epilog_stmt
5313               = gimple_build_assign (tem, BIT_FIELD_REF,
5314                                      build3 (BIT_FIELD_REF, eltype,
5315                                              new_temp, TYPE_SIZE (eltype),
5316                                              bitsize_int (bitsize)));
5317           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5318           dst2 =  make_ssa_name (vectype1);
5319           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5320                                              build1 (VIEW_CONVERT_EXPR,
5321                                                      vectype1, tem));
5322           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5323         }
5324
5325       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5326     }
5327
5328   return new_temp;
5329 }
5330
5331 /* Function vect_create_epilog_for_reduction
5332
5333    Create code at the loop-epilog to finalize the result of a reduction
5334    computation.
5335
5336    STMT_INFO is the scalar reduction stmt that is being vectorized.
5337    SLP_NODE is an SLP node containing a group of reduction statements. The
5338      first one in this group is STMT_INFO.
5339    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5340    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5341      (counting from 0)
5342
5343    This function:
5344    1. Completes the reduction def-use cycles.
5345    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5346       by calling the function specified by REDUC_FN if available, or by
5347       other means (whole-vector shifts or a scalar loop).
5348       The function also creates a new phi node at the loop exit to preserve
5349       loop-closed form, as illustrated below.
5350
5351      The flow at the entry to this function:
5352
5353         loop:
5354           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5355           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5356           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5357         loop_exit:
5358           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5359           use <s_out0>
5360           use <s_out0>
5361
5362      The above is transformed by this function into:
5363
5364         loop:
5365           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5366           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5367           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5368         loop_exit:
5369           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5370           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5371           v_out2 = reduce <v_out1>
5372           s_out3 = extract_field <v_out2, 0>
5373           s_out4 = adjust_result <s_out3>
5374           use <s_out4>
5375           use <s_out4>
5376 */
5377
5378 static void
5379 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5380                                   stmt_vec_info stmt_info,
5381                                   slp_tree slp_node,
5382                                   slp_instance slp_node_instance)
5383 {
5384   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5385   gcc_assert (reduc_info->is_reduc_info);
5386   /* For double reductions we need to get at the inner loop reduction
5387      stmt which has the meta info attached.  Our stmt_info is that of the
5388      loop-closed PHI of the inner loop which we remember as
5389      def for the reduction PHI generation.  */
5390   bool double_reduc = false;
5391   stmt_vec_info rdef_info = stmt_info;
5392   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5393     {
5394       gcc_assert (!slp_node);
5395       double_reduc = true;
5396       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5397                                             (stmt_info->stmt, 0));
5398       stmt_info = vect_stmt_to_vectorize (stmt_info);
5399     }
5400   gphi *reduc_def_stmt
5401     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5402   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5403   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5404   tree vectype;
5405   machine_mode mode;
5406   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5407   basic_block exit_bb;
5408   tree scalar_dest;
5409   tree scalar_type;
5410   gimple *new_phi = NULL, *phi;
5411   gimple_stmt_iterator exit_gsi;
5412   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5413   gimple *epilog_stmt = NULL;
5414   gimple *exit_phi;
5415   tree bitsize;
5416   tree def;
5417   tree orig_name, scalar_result;
5418   imm_use_iterator imm_iter, phi_imm_iter;
5419   use_operand_p use_p, phi_use_p;
5420   gimple *use_stmt;
5421   auto_vec<tree> reduc_inputs;
5422   int j, i;
5423   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5424   unsigned int group_size = 1, k;
5425   auto_vec<gimple *> phis;
5426   /* SLP reduction without reduction chain, e.g.,
5427      # a1 = phi <a2, a0>
5428      # b1 = phi <b2, b0>
5429      a2 = operation (a1)
5430      b2 = operation (b1)  */
5431   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5432   bool direct_slp_reduc;
5433   tree induction_index = NULL_TREE;
5434
5435   if (slp_node)
5436     group_size = SLP_TREE_LANES (slp_node);
5437
5438   if (nested_in_vect_loop_p (loop, stmt_info))
5439     {
5440       outer_loop = loop;
5441       loop = loop->inner;
5442       gcc_assert (!slp_node && double_reduc);
5443     }
5444
5445   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5446   gcc_assert (vectype);
5447   mode = TYPE_MODE (vectype);
5448
5449   tree induc_val = NULL_TREE;
5450   tree adjustment_def = NULL;
5451   if (slp_node)
5452     ;
5453   else
5454     {
5455       /* Optimize: for induction condition reduction, if we can't use zero
5456          for induc_val, use initial_def.  */
5457       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5458         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5459       else if (double_reduc)
5460         ;
5461       else
5462         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5463     }
5464
5465   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5466   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5467   if (slp_reduc)
5468     /* All statements produce live-out values.  */
5469     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5470   else if (slp_node)
5471     {
5472       /* The last statement in the reduction chain produces the live-out
5473          value.  Note SLP optimization can shuffle scalar stmts to
5474          optimize permutations so we have to search for the last stmt.  */
5475       for (k = 0; k < group_size; ++k)
5476         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5477           {
5478             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5479             break;
5480           }
5481     }
5482
5483   unsigned vec_num;
5484   int ncopies;
5485   if (slp_node)
5486     {
5487       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5488       ncopies = 1;
5489     }
5490   else
5491     {
5492       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5493       vec_num = 1;
5494       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5495     }
5496
5497   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5498      which is updated with the current index of the loop for every match of
5499      the original loop's cond_expr (VEC_STMT).  This results in a vector
5500      containing the last time the condition passed for that vector lane.
5501      The first match will be a 1 to allow 0 to be used for non-matching
5502      indexes.  If there are no matches at all then the vector will be all
5503      zeroes.
5504
5505      PR92772: This algorithm is broken for architectures that support
5506      masked vectors, but do not provide fold_extract_last.  */
5507   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5508     {
5509       auto_vec<std::pair<tree, bool>, 2> ccompares;
5510       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5511       cond_info = vect_stmt_to_vectorize (cond_info);
5512       while (cond_info != reduc_info)
5513         {
5514           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5515             {
5516               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5517               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5518               ccompares.safe_push
5519                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5520                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5521             }
5522           cond_info
5523             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5524                                                  1 + STMT_VINFO_REDUC_IDX
5525                                                         (cond_info)));
5526           cond_info = vect_stmt_to_vectorize (cond_info);
5527         }
5528       gcc_assert (ccompares.length () != 0);
5529
5530       tree indx_before_incr, indx_after_incr;
5531       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5532       int scalar_precision
5533         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5534       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5535       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5536         (TYPE_MODE (vectype), cr_index_scalar_type,
5537          TYPE_VECTOR_SUBPARTS (vectype));
5538
5539       /* First we create a simple vector induction variable which starts
5540          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5541          vector size (STEP).  */
5542
5543       /* Create a {1,2,3,...} vector.  */
5544       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5545
5546       /* Create a vector of the step value.  */
5547       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5548       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5549
5550       /* Create an induction variable.  */
5551       gimple_stmt_iterator incr_gsi;
5552       bool insert_after;
5553       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5554       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5555                  insert_after, &indx_before_incr, &indx_after_incr);
5556
5557       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5558          filled with zeros (VEC_ZERO).  */
5559
5560       /* Create a vector of 0s.  */
5561       tree zero = build_zero_cst (cr_index_scalar_type);
5562       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5563
5564       /* Create a vector phi node.  */
5565       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5566       new_phi = create_phi_node (new_phi_tree, loop->header);
5567       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5568                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5569
5570       /* Now take the condition from the loops original cond_exprs
5571          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5572          every match uses values from the induction variable
5573          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5574          (NEW_PHI_TREE).
5575          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5576          the new cond_expr (INDEX_COND_EXPR).  */
5577       gimple_seq stmts = NULL;
5578       for (int i = ccompares.length () - 1; i != -1; --i)
5579         {
5580           tree ccompare = ccompares[i].first;
5581           if (ccompares[i].second)
5582             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5583                                          cr_index_vector_type,
5584                                          ccompare,
5585                                          indx_before_incr, new_phi_tree);
5586           else
5587             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5588                                          cr_index_vector_type,
5589                                          ccompare,
5590                                          new_phi_tree, indx_before_incr);
5591         }
5592       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5593
5594       /* Update the phi with the vec cond.  */
5595       induction_index = new_phi_tree;
5596       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5597                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5598     }
5599
5600   /* 2. Create epilog code.
5601         The reduction epilog code operates across the elements of the vector
5602         of partial results computed by the vectorized loop.
5603         The reduction epilog code consists of:
5604
5605         step 1: compute the scalar result in a vector (v_out2)
5606         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5607         step 3: adjust the scalar result (s_out3) if needed.
5608
5609         Step 1 can be accomplished using one the following three schemes:
5610           (scheme 1) using reduc_fn, if available.
5611           (scheme 2) using whole-vector shifts, if available.
5612           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5613                      combined.
5614
5615           The overall epilog code looks like this:
5616
5617           s_out0 = phi <s_loop>         # original EXIT_PHI
5618           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5619           v_out2 = reduce <v_out1>              # step 1
5620           s_out3 = extract_field <v_out2, 0>    # step 2
5621           s_out4 = adjust_result <s_out3>       # step 3
5622
5623           (step 3 is optional, and steps 1 and 2 may be combined).
5624           Lastly, the uses of s_out0 are replaced by s_out4.  */
5625
5626
5627   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5628          v_out1 = phi <VECT_DEF>
5629          Store them in NEW_PHIS.  */
5630   if (double_reduc)
5631     loop = outer_loop;
5632   exit_bb = single_exit (loop)->dest;
5633   exit_gsi = gsi_after_labels (exit_bb);
5634   reduc_inputs.create (slp_node ? vec_num : ncopies);
5635   for (unsigned i = 0; i < vec_num; i++)
5636     {
5637       gimple_seq stmts = NULL;
5638       if (slp_node)
5639         def = vect_get_slp_vect_def (slp_node, i);
5640       else
5641         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5642       for (j = 0; j < ncopies; j++)
5643         {
5644           tree new_def = copy_ssa_name (def);
5645           phi = create_phi_node (new_def, exit_bb);
5646           if (j)
5647             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5648           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5649           new_def = gimple_convert (&stmts, vectype, new_def);
5650           reduc_inputs.quick_push (new_def);
5651         }
5652       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5653     }
5654
5655   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5656          (i.e. when reduc_fn is not available) and in the final adjustment
5657          code (if needed).  Also get the original scalar reduction variable as
5658          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5659          represents a reduction pattern), the tree-code and scalar-def are
5660          taken from the original stmt that the pattern-stmt (STMT) replaces.
5661          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5662          are taken from STMT.  */
5663
5664   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5665   if (orig_stmt_info != stmt_info)
5666     {
5667       /* Reduction pattern  */
5668       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5669       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5670     }
5671
5672   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5673   scalar_type = TREE_TYPE (scalar_dest);
5674   scalar_results.truncate (0);
5675   scalar_results.reserve_exact (group_size);
5676   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5677   bitsize = TYPE_SIZE (scalar_type);
5678
5679   /* True if we should implement SLP_REDUC using native reduction operations
5680      instead of scalar operations.  */
5681   direct_slp_reduc = (reduc_fn != IFN_LAST
5682                       && slp_reduc
5683                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5684
5685   /* In case of reduction chain, e.g.,
5686      # a1 = phi <a3, a0>
5687      a2 = operation (a1)
5688      a3 = operation (a2),
5689
5690      we may end up with more than one vector result.  Here we reduce them
5691      to one vector.
5692
5693      The same is true for a SLP reduction, e.g.,
5694      # a1 = phi <a2, a0>
5695      # b1 = phi <b2, b0>
5696      a2 = operation (a1)
5697      b2 = operation (a2),
5698
5699      where we can end up with more than one vector as well.  We can
5700      easily accumulate vectors when the number of vector elements is
5701      a multiple of the SLP group size.
5702
5703      The same is true if we couldn't use a single defuse cycle.  */
5704   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5705       || direct_slp_reduc
5706       || (slp_reduc
5707           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
5708       || ncopies > 1)
5709     {
5710       gimple_seq stmts = NULL;
5711       tree single_input = reduc_inputs[0];
5712       for (k = 1; k < reduc_inputs.length (); k++)
5713         single_input = gimple_build (&stmts, code, vectype,
5714                                      single_input, reduc_inputs[k]);
5715       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5716
5717       reduc_inputs.truncate (0);
5718       reduc_inputs.safe_push (single_input);
5719     }
5720
5721   tree orig_reduc_input = reduc_inputs[0];
5722
5723   /* If this loop is an epilogue loop that can be skipped after the
5724      main loop, we can only share a reduction operation between the
5725      main loop and the epilogue if we put it at the target of the
5726      skip edge.
5727
5728      We can still reuse accumulators if this check fails.  Doing so has
5729      the minor(?) benefit of making the epilogue loop's scalar result
5730      independent of the main loop's scalar result.  */
5731   bool unify_with_main_loop_p = false;
5732   if (reduc_info->reused_accumulator
5733       && loop_vinfo->skip_this_loop_edge
5734       && single_succ_p (exit_bb)
5735       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5736     {
5737       unify_with_main_loop_p = true;
5738
5739       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5740       reduc_inputs[0] = make_ssa_name (vectype);
5741       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5742       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5743                    UNKNOWN_LOCATION);
5744       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5745                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5746       exit_gsi = gsi_after_labels (reduc_block);
5747     }
5748
5749   /* Shouldn't be used beyond this point.  */
5750   exit_bb = nullptr;
5751
5752   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5753       && reduc_fn != IFN_LAST)
5754     {
5755       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5756          various data values where the condition matched and another vector
5757          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5758          need to extract the last matching index (which will be the index with
5759          highest value) and use this to index into the data vector.
5760          For the case where there were no matches, the data vector will contain
5761          all default values and the index vector will be all zeros.  */
5762
5763       /* Get various versions of the type of the vector of indexes.  */
5764       tree index_vec_type = TREE_TYPE (induction_index);
5765       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5766       tree index_scalar_type = TREE_TYPE (index_vec_type);
5767       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5768
5769       /* Get an unsigned integer version of the type of the data vector.  */
5770       int scalar_precision
5771         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5772       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5773       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5774                                                 vectype);
5775
5776       /* First we need to create a vector (ZERO_VEC) of zeros and another
5777          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5778          can create using a MAX reduction and then expanding.
5779          In the case where the loop never made any matches, the max index will
5780          be zero.  */
5781
5782       /* Vector of {0, 0, 0,...}.  */
5783       tree zero_vec = build_zero_cst (vectype);
5784
5785       /* Find maximum value from the vector of found indexes.  */
5786       tree max_index = make_ssa_name (index_scalar_type);
5787       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5788                                                           1, induction_index);
5789       gimple_call_set_lhs (max_index_stmt, max_index);
5790       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5791
5792       /* Vector of {max_index, max_index, max_index,...}.  */
5793       tree max_index_vec = make_ssa_name (index_vec_type);
5794       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5795                                                       max_index);
5796       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5797                                                         max_index_vec_rhs);
5798       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5799
5800       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5801          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5802          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5803          otherwise.  Only one value should match, resulting in a vector
5804          (VEC_COND) with one data value and the rest zeros.
5805          In the case where the loop never made any matches, every index will
5806          match, resulting in a vector with all data values (which will all be
5807          the default value).  */
5808
5809       /* Compare the max index vector to the vector of found indexes to find
5810          the position of the max value.  */
5811       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5812       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5813                                                       induction_index,
5814                                                       max_index_vec);
5815       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5816
5817       /* Use the compare to choose either values from the data vector or
5818          zero.  */
5819       tree vec_cond = make_ssa_name (vectype);
5820       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5821                                                    vec_compare,
5822                                                    reduc_inputs[0],
5823                                                    zero_vec);
5824       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5825
5826       /* Finally we need to extract the data value from the vector (VEC_COND)
5827          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5828          reduction, but because this doesn't exist, we can use a MAX reduction
5829          instead.  The data value might be signed or a float so we need to cast
5830          it first.
5831          In the case where the loop never made any matches, the data values are
5832          all identical, and so will reduce down correctly.  */
5833
5834       /* Make the matched data values unsigned.  */
5835       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5836       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5837                                        vec_cond);
5838       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5839                                                         VIEW_CONVERT_EXPR,
5840                                                         vec_cond_cast_rhs);
5841       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5842
5843       /* Reduce down to a scalar value.  */
5844       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5845       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5846                                                            1, vec_cond_cast);
5847       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5848       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5849
5850       /* Convert the reduced value back to the result type and set as the
5851          result.  */
5852       gimple_seq stmts = NULL;
5853       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5854                                data_reduc);
5855       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5856       scalar_results.safe_push (new_temp);
5857     }
5858   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5859            && reduc_fn == IFN_LAST)
5860     {
5861       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5862          idx = 0;
5863          idx_val = induction_index[0];
5864          val = data_reduc[0];
5865          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5866            if (induction_index[i] > idx_val)
5867              val = data_reduc[i], idx_val = induction_index[i];
5868          return val;  */
5869
5870       tree data_eltype = TREE_TYPE (vectype);
5871       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5872       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5873       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5874       /* Enforced by vectorizable_reduction, which ensures we have target
5875          support before allowing a conditional reduction on variable-length
5876          vectors.  */
5877       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5878       tree idx_val = NULL_TREE, val = NULL_TREE;
5879       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5880         {
5881           tree old_idx_val = idx_val;
5882           tree old_val = val;
5883           idx_val = make_ssa_name (idx_eltype);
5884           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5885                                              build3 (BIT_FIELD_REF, idx_eltype,
5886                                                      induction_index,
5887                                                      bitsize_int (el_size),
5888                                                      bitsize_int (off)));
5889           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5890           val = make_ssa_name (data_eltype);
5891           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5892                                              build3 (BIT_FIELD_REF,
5893                                                      data_eltype,
5894                                                      reduc_inputs[0],
5895                                                      bitsize_int (el_size),
5896                                                      bitsize_int (off)));
5897           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5898           if (off != 0)
5899             {
5900               tree new_idx_val = idx_val;
5901               if (off != v_size - el_size)
5902                 {
5903                   new_idx_val = make_ssa_name (idx_eltype);
5904                   epilog_stmt = gimple_build_assign (new_idx_val,
5905                                                      MAX_EXPR, idx_val,
5906                                                      old_idx_val);
5907                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5908                 }
5909               tree cond = make_ssa_name (boolean_type_node);
5910               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5911                                                  idx_val, old_idx_val);
5912               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5913               tree new_val = make_ssa_name (data_eltype);
5914               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5915                                                  cond, val, old_val);
5916               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5917               idx_val = new_idx_val;
5918               val = new_val;
5919             }
5920         }
5921       /* Convert the reduced value back to the result type and set as the
5922          result.  */
5923       gimple_seq stmts = NULL;
5924       val = gimple_convert (&stmts, scalar_type, val);
5925       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5926       scalar_results.safe_push (val);
5927     }
5928
5929   /* 2.3 Create the reduction code, using one of the three schemes described
5930          above. In SLP we simply need to extract all the elements from the
5931          vector (without reducing them), so we use scalar shifts.  */
5932   else if (reduc_fn != IFN_LAST && !slp_reduc)
5933     {
5934       tree tmp;
5935       tree vec_elem_type;
5936
5937       /* Case 1:  Create:
5938          v_out2 = reduc_expr <v_out1>  */
5939
5940       if (dump_enabled_p ())
5941         dump_printf_loc (MSG_NOTE, vect_location,
5942                          "Reduce using direct vector reduction.\n");
5943
5944       gimple_seq stmts = NULL;
5945       vec_elem_type = TREE_TYPE (vectype);
5946       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5947                                vec_elem_type, reduc_inputs[0]);
5948       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5949       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5950
5951       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5952           && induc_val)
5953         {
5954           /* Earlier we set the initial value to be a vector if induc_val
5955              values.  Check the result and if it is induc_val then replace
5956              with the original initial value, unless induc_val is
5957              the same as initial_def already.  */
5958           tree zcompare = make_ssa_name (boolean_type_node);
5959           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5960                                              new_temp, induc_val);
5961           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5962           tree initial_def = reduc_info->reduc_initial_values[0];
5963           tmp = make_ssa_name (new_scalar_dest);
5964           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5965                                              initial_def, new_temp);
5966           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5967           new_temp = tmp;
5968         }
5969
5970       scalar_results.safe_push (new_temp);
5971     }
5972   else if (direct_slp_reduc)
5973     {
5974       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5975          with the elements for other SLP statements replaced with the
5976          neutral value.  We can then do a normal reduction on each vector.  */
5977
5978       /* Enforced by vectorizable_reduction.  */
5979       gcc_assert (reduc_inputs.length () == 1);
5980       gcc_assert (pow2p_hwi (group_size));
5981
5982       gimple_seq seq = NULL;
5983
5984       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5985          and the same element size as VECTYPE.  */
5986       tree index = build_index_vector (vectype, 0, 1);
5987       tree index_type = TREE_TYPE (index);
5988       tree index_elt_type = TREE_TYPE (index_type);
5989       tree mask_type = truth_type_for (index_type);
5990
5991       /* Create a vector that, for each element, identifies which of
5992          the REDUC_GROUP_SIZE results should use it.  */
5993       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5994       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5995                             build_vector_from_val (index_type, index_mask));
5996
5997       /* Get a neutral vector value.  This is simply a splat of the neutral
5998          scalar value if we have one, otherwise the initial scalar value
5999          is itself a neutral value.  */
6000       tree vector_identity = NULL_TREE;
6001       tree neutral_op = NULL_TREE;
6002       if (slp_node)
6003         {
6004           tree initial_value = NULL_TREE;
6005           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6006             initial_value = reduc_info->reduc_initial_values[0];
6007           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6008                                                  initial_value);
6009         }
6010       if (neutral_op)
6011         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6012                                                         neutral_op);
6013       for (unsigned int i = 0; i < group_size; ++i)
6014         {
6015           /* If there's no univeral neutral value, we can use the
6016              initial scalar value from the original PHI.  This is used
6017              for MIN and MAX reduction, for example.  */
6018           if (!neutral_op)
6019             {
6020               tree scalar_value = reduc_info->reduc_initial_values[i];
6021               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6022                                              scalar_value);
6023               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6024                                                               scalar_value);
6025             }
6026
6027           /* Calculate the equivalent of:
6028
6029              sel[j] = (index[j] == i);
6030
6031              which selects the elements of REDUC_INPUTS[0] that should
6032              be included in the result.  */
6033           tree compare_val = build_int_cst (index_elt_type, i);
6034           compare_val = build_vector_from_val (index_type, compare_val);
6035           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6036                                    index, compare_val);
6037
6038           /* Calculate the equivalent of:
6039
6040              vec = seq ? reduc_inputs[0] : vector_identity;
6041
6042              VEC is now suitable for a full vector reduction.  */
6043           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6044                                    sel, reduc_inputs[0], vector_identity);
6045
6046           /* Do the reduction and convert it to the appropriate type.  */
6047           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6048                                       TREE_TYPE (vectype), vec);
6049           scalar = gimple_convert (&seq, scalar_type, scalar);
6050           scalar_results.safe_push (scalar);
6051         }
6052       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6053     }
6054   else
6055     {
6056       bool reduce_with_shift;
6057       tree vec_temp;
6058
6059       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6060
6061       /* See if the target wants to do the final (shift) reduction
6062          in a vector mode of smaller size and first reduce upper/lower
6063          halves against each other.  */
6064       enum machine_mode mode1 = mode;
6065       tree stype = TREE_TYPE (vectype);
6066       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6067       unsigned nunits1 = nunits;
6068       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6069           && reduc_inputs.length () == 1)
6070         {
6071           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6072           /* For SLP reductions we have to make sure lanes match up, but
6073              since we're doing individual element final reduction reducing
6074              vector width here is even more important.
6075              ???  We can also separate lanes with permutes, for the common
6076              case of power-of-two group-size odd/even extracts would work.  */
6077           if (slp_reduc && nunits != nunits1)
6078             {
6079               nunits1 = least_common_multiple (nunits1, group_size);
6080               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6081             }
6082         }
6083       if (!slp_reduc
6084           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6085         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6086
6087       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6088                                                            stype, nunits1);
6089       reduce_with_shift = have_whole_vector_shift (mode1);
6090       if (!VECTOR_MODE_P (mode1)
6091           || !directly_supported_p (code, vectype1))
6092         reduce_with_shift = false;
6093
6094       /* First reduce the vector to the desired vector size we should
6095          do shift reduction on by combining upper and lower halves.  */
6096       gimple_seq stmts = NULL;
6097       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6098                                              code, &stmts);
6099       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6100       reduc_inputs[0] = new_temp;
6101
6102       if (reduce_with_shift && !slp_reduc)
6103         {
6104           int element_bitsize = tree_to_uhwi (bitsize);
6105           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6106              for variable-length vectors and also requires direct target support
6107              for loop reductions.  */
6108           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6109           int nelements = vec_size_in_bits / element_bitsize;
6110           vec_perm_builder sel;
6111           vec_perm_indices indices;
6112
6113           int elt_offset;
6114
6115           tree zero_vec = build_zero_cst (vectype1);
6116           /* Case 2: Create:
6117              for (offset = nelements/2; offset >= 1; offset/=2)
6118                 {
6119                   Create:  va' = vec_shift <va, offset>
6120                   Create:  va = vop <va, va'>
6121                 }  */
6122
6123           tree rhs;
6124
6125           if (dump_enabled_p ())
6126             dump_printf_loc (MSG_NOTE, vect_location,
6127                              "Reduce using vector shifts\n");
6128
6129           gimple_seq stmts = NULL;
6130           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6131           for (elt_offset = nelements / 2;
6132                elt_offset >= 1;
6133                elt_offset /= 2)
6134             {
6135               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6136               indices.new_vector (sel, 2, nelements);
6137               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6138               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6139                                        new_temp, zero_vec, mask);
6140               new_temp = gimple_build (&stmts, code,
6141                                        vectype1, new_name, new_temp);
6142             }
6143           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6144
6145           /* 2.4  Extract the final scalar result.  Create:
6146              s_out3 = extract_field <v_out2, bitpos>  */
6147
6148           if (dump_enabled_p ())
6149             dump_printf_loc (MSG_NOTE, vect_location,
6150                              "extract scalar result\n");
6151
6152           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6153                         bitsize, bitsize_zero_node);
6154           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6155           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6156           gimple_assign_set_lhs (epilog_stmt, new_temp);
6157           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6158           scalar_results.safe_push (new_temp);
6159         }
6160       else
6161         {
6162           /* Case 3: Create:
6163              s = extract_field <v_out2, 0>
6164              for (offset = element_size;
6165                   offset < vector_size;
6166                   offset += element_size;)
6167                {
6168                  Create:  s' = extract_field <v_out2, offset>
6169                  Create:  s = op <s, s'>  // For non SLP cases
6170                }  */
6171
6172           if (dump_enabled_p ())
6173             dump_printf_loc (MSG_NOTE, vect_location,
6174                              "Reduce using scalar code.\n");
6175
6176           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6177           int element_bitsize = tree_to_uhwi (bitsize);
6178           tree compute_type = TREE_TYPE (vectype);
6179           gimple_seq stmts = NULL;
6180           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6181             {
6182               int bit_offset;
6183               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6184                                        vec_temp, bitsize, bitsize_zero_node);
6185
6186               /* In SLP we don't need to apply reduction operation, so we just
6187                  collect s' values in SCALAR_RESULTS.  */
6188               if (slp_reduc)
6189                 scalar_results.safe_push (new_temp);
6190
6191               for (bit_offset = element_bitsize;
6192                    bit_offset < vec_size_in_bits;
6193                    bit_offset += element_bitsize)
6194                 {
6195                   tree bitpos = bitsize_int (bit_offset);
6196                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6197                                            compute_type, vec_temp,
6198                                            bitsize, bitpos);
6199                   if (slp_reduc)
6200                     {
6201                       /* In SLP we don't need to apply reduction operation, so
6202                          we just collect s' values in SCALAR_RESULTS.  */
6203                       new_temp = new_name;
6204                       scalar_results.safe_push (new_name);
6205                     }
6206                   else
6207                     new_temp = gimple_build (&stmts, code, compute_type,
6208                                              new_name, new_temp);
6209                 }
6210             }
6211
6212           /* The only case where we need to reduce scalar results in SLP, is
6213              unrolling.  If the size of SCALAR_RESULTS is greater than
6214              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6215              REDUC_GROUP_SIZE.  */
6216           if (slp_reduc)
6217             {
6218               tree res, first_res, new_res;
6219
6220               /* Reduce multiple scalar results in case of SLP unrolling.  */
6221               for (j = group_size; scalar_results.iterate (j, &res);
6222                    j++)
6223                 {
6224                   first_res = scalar_results[j % group_size];
6225                   new_res = gimple_build (&stmts, code, compute_type,
6226                                           first_res, res);
6227                   scalar_results[j % group_size] = new_res;
6228                 }
6229               scalar_results.truncate (group_size);
6230               for (k = 0; k < group_size; k++)
6231                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6232                                                     scalar_results[k]);
6233             }
6234           else
6235             {
6236               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6237               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6238               scalar_results.safe_push (new_temp);
6239             }
6240
6241           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6242         }
6243
6244       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6245           && induc_val)
6246         {
6247           /* Earlier we set the initial value to be a vector if induc_val
6248              values.  Check the result and if it is induc_val then replace
6249              with the original initial value, unless induc_val is
6250              the same as initial_def already.  */
6251           tree zcompare = make_ssa_name (boolean_type_node);
6252           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6253                                              induc_val);
6254           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6255           tree initial_def = reduc_info->reduc_initial_values[0];
6256           tree tmp = make_ssa_name (new_scalar_dest);
6257           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6258                                              initial_def, new_temp);
6259           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6260           scalar_results[0] = tmp;
6261         }
6262     }
6263
6264   /* 2.5 Adjust the final result by the initial value of the reduction
6265          variable. (When such adjustment is not needed, then
6266          'adjustment_def' is zero).  For example, if code is PLUS we create:
6267          new_temp = loop_exit_def + adjustment_def  */
6268
6269   if (adjustment_def)
6270     {
6271       gcc_assert (!slp_reduc);
6272       gimple_seq stmts = NULL;
6273       if (double_reduc)
6274         {
6275           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6276           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6277           new_temp = gimple_build (&stmts, code, vectype,
6278                                    reduc_inputs[0], adjustment_def);
6279         }
6280       else
6281         {
6282           new_temp = scalar_results[0];
6283           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6284           adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6285           new_temp = gimple_build (&stmts, code, scalar_type,
6286                                    new_temp, adjustment_def);
6287         }
6288
6289       epilog_stmt = gimple_seq_last_stmt (stmts);
6290       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6291       scalar_results[0] = new_temp;
6292     }
6293
6294   /* Record this operation if it could be reused by the epilogue loop.  */
6295   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6296       && reduc_inputs.length () == 1)
6297     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6298                                            { orig_reduc_input, reduc_info });
6299
6300   if (double_reduc)
6301     loop = outer_loop;
6302
6303   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6304           phis with new adjusted scalar results, i.e., replace use <s_out0>
6305           with use <s_out4>.
6306
6307      Transform:
6308         loop_exit:
6309           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6310           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6311           v_out2 = reduce <v_out1>
6312           s_out3 = extract_field <v_out2, 0>
6313           s_out4 = adjust_result <s_out3>
6314           use <s_out0>
6315           use <s_out0>
6316
6317      into:
6318
6319         loop_exit:
6320           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6321           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6322           v_out2 = reduce <v_out1>
6323           s_out3 = extract_field <v_out2, 0>
6324           s_out4 = adjust_result <s_out3>
6325           use <s_out4>
6326           use <s_out4> */
6327
6328   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6329   for (k = 0; k < live_out_stmts.size (); k++)
6330     {
6331       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6332       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6333
6334       phis.create (3);
6335       /* Find the loop-closed-use at the loop exit of the original scalar
6336          result.  (The reduction result is expected to have two immediate uses,
6337          one at the latch block, and one at the loop exit).  For double
6338          reductions we are looking for exit phis of the outer loop.  */
6339       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6340         {
6341           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6342             {
6343               if (!is_gimple_debug (USE_STMT (use_p)))
6344                 phis.safe_push (USE_STMT (use_p));
6345             }
6346           else
6347             {
6348               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6349                 {
6350                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6351
6352                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6353                     {
6354                       if (!flow_bb_inside_loop_p (loop,
6355                                              gimple_bb (USE_STMT (phi_use_p)))
6356                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6357                         phis.safe_push (USE_STMT (phi_use_p));
6358                     }
6359                 }
6360             }
6361         }
6362
6363       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6364         {
6365           /* Replace the uses:  */
6366           orig_name = PHI_RESULT (exit_phi);
6367
6368           /* Look for a single use at the target of the skip edge.  */
6369           if (unify_with_main_loop_p)
6370             {
6371               use_operand_p use_p;
6372               gimple *user;
6373               if (!single_imm_use (orig_name, &use_p, &user))
6374                 gcc_unreachable ();
6375               orig_name = gimple_get_lhs (user);
6376             }
6377
6378           scalar_result = scalar_results[k];
6379           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6380             {
6381               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6382                 SET_USE (use_p, scalar_result);
6383               update_stmt (use_stmt);
6384             }
6385         }
6386
6387       phis.release ();
6388     }
6389 }
6390
6391 /* Return a vector of type VECTYPE that is equal to the vector select
6392    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6393    before GSI.  */
6394
6395 static tree
6396 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6397                      tree vec, tree identity)
6398 {
6399   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6400   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6401                                           mask, vec, identity);
6402   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6403   return cond;
6404 }
6405
6406 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6407    order, starting with LHS.  Insert the extraction statements before GSI and
6408    associate the new scalar SSA names with variable SCALAR_DEST.
6409    Return the SSA name for the result.  */
6410
6411 static tree
6412 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6413                        tree_code code, tree lhs, tree vector_rhs)
6414 {
6415   tree vectype = TREE_TYPE (vector_rhs);
6416   tree scalar_type = TREE_TYPE (vectype);
6417   tree bitsize = TYPE_SIZE (scalar_type);
6418   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6419   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6420
6421   for (unsigned HOST_WIDE_INT bit_offset = 0;
6422        bit_offset < vec_size_in_bits;
6423        bit_offset += element_bitsize)
6424     {
6425       tree bitpos = bitsize_int (bit_offset);
6426       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6427                          bitsize, bitpos);
6428
6429       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6430       rhs = make_ssa_name (scalar_dest, stmt);
6431       gimple_assign_set_lhs (stmt, rhs);
6432       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6433
6434       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6435       tree new_name = make_ssa_name (scalar_dest, stmt);
6436       gimple_assign_set_lhs (stmt, new_name);
6437       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6438       lhs = new_name;
6439     }
6440   return lhs;
6441 }
6442
6443 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6444    type of the vector input.  */
6445
6446 static internal_fn
6447 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6448 {
6449   internal_fn mask_reduc_fn;
6450
6451   switch (reduc_fn)
6452     {
6453     case IFN_FOLD_LEFT_PLUS:
6454       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6455       break;
6456
6457     default:
6458       return IFN_LAST;
6459     }
6460
6461   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6462                                       OPTIMIZE_FOR_SPEED))
6463     return mask_reduc_fn;
6464   return IFN_LAST;
6465 }
6466
6467 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6468    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6469    statement.  CODE is the operation performed by STMT_INFO and OPS are
6470    its scalar operands.  REDUC_INDEX is the index of the operand in
6471    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6472    implements in-order reduction, or IFN_LAST if we should open-code it.
6473    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6474    that should be used to control the operation in a fully-masked loop.  */
6475
6476 static bool
6477 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6478                                stmt_vec_info stmt_info,
6479                                gimple_stmt_iterator *gsi,
6480                                gimple **vec_stmt, slp_tree slp_node,
6481                                gimple *reduc_def_stmt,
6482                                tree_code code, internal_fn reduc_fn,
6483                                tree ops[3], tree vectype_in,
6484                                int reduc_index, vec_loop_masks *masks)
6485 {
6486   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6487   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6488   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6489
6490   int ncopies;
6491   if (slp_node)
6492     ncopies = 1;
6493   else
6494     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6495
6496   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6497   gcc_assert (ncopies == 1);
6498   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6499
6500   if (slp_node)
6501     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6502                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6503
6504   tree op0 = ops[1 - reduc_index];
6505
6506   int group_size = 1;
6507   stmt_vec_info scalar_dest_def_info;
6508   auto_vec<tree> vec_oprnds0;
6509   if (slp_node)
6510     {
6511       auto_vec<vec<tree> > vec_defs (2);
6512       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6513       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6514       vec_defs[0].release ();
6515       vec_defs[1].release ();
6516       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6517       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6518     }
6519   else
6520     {
6521       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6522                                      op0, &vec_oprnds0);
6523       scalar_dest_def_info = stmt_info;
6524     }
6525
6526   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6527   tree scalar_type = TREE_TYPE (scalar_dest);
6528   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6529
6530   int vec_num = vec_oprnds0.length ();
6531   gcc_assert (vec_num == 1 || slp_node);
6532   tree vec_elem_type = TREE_TYPE (vectype_out);
6533   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6534
6535   tree vector_identity = NULL_TREE;
6536   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6537     vector_identity = build_zero_cst (vectype_out);
6538
6539   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6540   int i;
6541   tree def0;
6542   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6543     {
6544       gimple *new_stmt;
6545       tree mask = NULL_TREE;
6546       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6547         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6548
6549       /* Handle MINUS by adding the negative.  */
6550       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6551         {
6552           tree negated = make_ssa_name (vectype_out);
6553           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6554           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6555           def0 = negated;
6556         }
6557
6558       if (mask && mask_reduc_fn == IFN_LAST)
6559         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6560                                     vector_identity);
6561
6562       /* On the first iteration the input is simply the scalar phi
6563          result, and for subsequent iterations it is the output of
6564          the preceding operation.  */
6565       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6566         {
6567           if (mask && mask_reduc_fn != IFN_LAST)
6568             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6569                                                    def0, mask);
6570           else
6571             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6572                                                    def0);
6573           /* For chained SLP reductions the output of the previous reduction
6574              operation serves as the input of the next. For the final statement
6575              the output cannot be a temporary - we reuse the original
6576              scalar destination of the last statement.  */
6577           if (i != vec_num - 1)
6578             {
6579               gimple_set_lhs (new_stmt, scalar_dest_var);
6580               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6581               gimple_set_lhs (new_stmt, reduc_var);
6582             }
6583         }
6584       else
6585         {
6586           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6587                                              reduc_var, def0);
6588           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6589           /* Remove the statement, so that we can use the same code paths
6590              as for statements that we've just created.  */
6591           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6592           gsi_remove (&tmp_gsi, true);
6593         }
6594
6595       if (i == vec_num - 1)
6596         {
6597           gimple_set_lhs (new_stmt, scalar_dest);
6598           vect_finish_replace_stmt (loop_vinfo,
6599                                     scalar_dest_def_info,
6600                                     new_stmt);
6601         }
6602       else
6603         vect_finish_stmt_generation (loop_vinfo,
6604                                      scalar_dest_def_info,
6605                                      new_stmt, gsi);
6606
6607       if (slp_node)
6608         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6609       else
6610         {
6611           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6612           *vec_stmt = new_stmt;
6613         }
6614     }
6615
6616   return true;
6617 }
6618
6619 /* Function is_nonwrapping_integer_induction.
6620
6621    Check if STMT_VINO (which is part of loop LOOP) both increments and
6622    does not cause overflow.  */
6623
6624 static bool
6625 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6626 {
6627   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6628   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6629   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6630   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6631   widest_int ni, max_loop_value, lhs_max;
6632   wi::overflow_type overflow = wi::OVF_NONE;
6633
6634   /* Make sure the loop is integer based.  */
6635   if (TREE_CODE (base) != INTEGER_CST
6636       || TREE_CODE (step) != INTEGER_CST)
6637     return false;
6638
6639   /* Check that the max size of the loop will not wrap.  */
6640
6641   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6642     return true;
6643
6644   if (! max_stmt_executions (loop, &ni))
6645     return false;
6646
6647   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6648                             &overflow);
6649   if (overflow)
6650     return false;
6651
6652   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6653                             TYPE_SIGN (lhs_type), &overflow);
6654   if (overflow)
6655     return false;
6656
6657   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6658           <= TYPE_PRECISION (lhs_type));
6659 }
6660
6661 /* Check if masking can be supported by inserting a conditional expression.
6662    CODE is the code for the operation.  COND_FN is the conditional internal
6663    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6664 static bool
6665 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6666                          tree vectype_in)
6667 {
6668   if (cond_fn != IFN_LAST
6669       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6670                                          OPTIMIZE_FOR_SPEED))
6671     return false;
6672
6673   if (code.is_tree_code ())
6674     switch (tree_code (code))
6675       {
6676       case DOT_PROD_EXPR:
6677       case SAD_EXPR:
6678         return true;
6679
6680       default:
6681         break;
6682       }
6683   return false;
6684 }
6685
6686 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6687    code for the operation.  VOP is the array of operands.  MASK is the loop
6688    mask.  GSI is a statement iterator used to place the new conditional
6689    expression.  */
6690 static void
6691 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6692                       gimple_stmt_iterator *gsi)
6693 {
6694   switch (tree_code (code))
6695     {
6696     case DOT_PROD_EXPR:
6697       {
6698         tree vectype = TREE_TYPE (vop[1]);
6699         tree zero = build_zero_cst (vectype);
6700         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6701         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6702                                                mask, vop[1], zero);
6703         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6704         vop[1] = masked_op1;
6705         break;
6706       }
6707
6708     case SAD_EXPR:
6709       {
6710         tree vectype = TREE_TYPE (vop[1]);
6711         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6712         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6713                                                mask, vop[1], vop[0]);
6714         gsi_insert_before (gsi, select, GSI_SAME_STMT);
6715         vop[1] = masked_op1;
6716         break;
6717       }
6718
6719     default:
6720       gcc_unreachable ();
6721     }
6722 }
6723
6724 /* Function vectorizable_reduction.
6725
6726    Check if STMT_INFO performs a reduction operation that can be vectorized.
6727    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6728    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6729    Return true if STMT_INFO is vectorizable in this way.
6730
6731    This function also handles reduction idioms (patterns) that have been
6732    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6733    may be of this form:
6734      X = pattern_expr (arg0, arg1, ..., X)
6735    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6736    sequence that had been detected and replaced by the pattern-stmt
6737    (STMT_INFO).
6738
6739    This function also handles reduction of condition expressions, for example:
6740      for (int i = 0; i < N; i++)
6741        if (a[i] < value)
6742          last = a[i];
6743    This is handled by vectorising the loop and creating an additional vector
6744    containing the loop indexes for which "a[i] < value" was true.  In the
6745    function epilogue this is reduced to a single max value and then used to
6746    index into the vector of results.
6747
6748    In some cases of reduction patterns, the type of the reduction variable X is
6749    different than the type of the other arguments of STMT_INFO.
6750    In such cases, the vectype that is used when transforming STMT_INFO into
6751    a vector stmt is different than the vectype that is used to determine the
6752    vectorization factor, because it consists of a different number of elements
6753    than the actual number of elements that are being operated upon in parallel.
6754
6755    For example, consider an accumulation of shorts into an int accumulator.
6756    On some targets it's possible to vectorize this pattern operating on 8
6757    shorts at a time (hence, the vectype for purposes of determining the
6758    vectorization factor should be V8HI); on the other hand, the vectype that
6759    is used to create the vector form is actually V4SI (the type of the result).
6760
6761    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6762    indicates what is the actual level of parallelism (V8HI in the example), so
6763    that the right vectorization factor would be derived.  This vectype
6764    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6765    be used to create the vectorized stmt.  The right vectype for the vectorized
6766    stmt is obtained from the type of the result X:
6767       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6768
6769    This means that, contrary to "regular" reductions (or "regular" stmts in
6770    general), the following equation:
6771       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6772    does *NOT* necessarily hold for reduction patterns.  */
6773
6774 bool
6775 vectorizable_reduction (loop_vec_info loop_vinfo,
6776                         stmt_vec_info stmt_info, slp_tree slp_node,
6777                         slp_instance slp_node_instance,
6778                         stmt_vector_for_cost *cost_vec)
6779 {
6780   tree vectype_in = NULL_TREE;
6781   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6782   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6783   stmt_vec_info cond_stmt_vinfo = NULL;
6784   int i;
6785   int ncopies;
6786   bool single_defuse_cycle = false;
6787   bool nested_cycle = false;
6788   bool double_reduc = false;
6789   int vec_num;
6790   tree tem;
6791   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6792   tree cond_reduc_val = NULL_TREE;
6793
6794   /* Make sure it was already recognized as a reduction computation.  */
6795   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6796       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6797       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6798     return false;
6799
6800   /* The stmt we store reduction analysis meta on.  */
6801   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6802   reduc_info->is_reduc_info = true;
6803
6804   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6805     {
6806       if (is_a <gphi *> (stmt_info->stmt))
6807         {
6808           if (slp_node)
6809             {
6810               /* We eventually need to set a vector type on invariant
6811                  arguments.  */
6812               unsigned j;
6813               slp_tree child;
6814               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6815                 if (!vect_maybe_update_slp_op_vectype
6816                        (child, SLP_TREE_VECTYPE (slp_node)))
6817                   {
6818                     if (dump_enabled_p ())
6819                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820                                        "incompatible vector types for "
6821                                        "invariants\n");
6822                     return false;
6823                   }
6824             }
6825           /* Analysis for double-reduction is done on the outer
6826              loop PHI, nested cycles have no further restrictions.  */
6827           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6828         }
6829       else
6830         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6831       return true;
6832     }
6833
6834   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6835   stmt_vec_info phi_info = stmt_info;
6836   if (!is_a <gphi *> (stmt_info->stmt))
6837     {
6838       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6839       return true;
6840     }
6841   if (slp_node)
6842     {
6843       slp_node_instance->reduc_phis = slp_node;
6844       /* ???  We're leaving slp_node to point to the PHIs, we only
6845          need it to get at the number of vector stmts which wasn't
6846          yet initialized for the instance root.  */
6847     }
6848   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6849     {
6850       use_operand_p use_p;
6851       gimple *use_stmt;
6852       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6853                                  &use_p, &use_stmt);
6854       gcc_assert (res);
6855       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6856     }
6857
6858   /* PHIs should not participate in patterns.  */
6859   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6860   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6861
6862   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6863      and compute the reduction chain length.  Discover the real
6864      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6865   tree reduc_def
6866     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6867                              loop_latch_edge
6868                                (gimple_bb (reduc_def_phi)->loop_father));
6869   unsigned reduc_chain_length = 0;
6870   bool only_slp_reduc_chain = true;
6871   stmt_info = NULL;
6872   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6873   while (reduc_def != PHI_RESULT (reduc_def_phi))
6874     {
6875       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6876       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6877       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6878         {
6879           if (dump_enabled_p ())
6880             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881                              "reduction chain broken by patterns.\n");
6882           return false;
6883         }
6884       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6885         only_slp_reduc_chain = false;
6886       /* For epilogue generation live members of the chain need
6887          to point back to the PHI via their original stmt for
6888          info_for_reduction to work.  For SLP we need to look at
6889          all lanes here - even though we only will vectorize from
6890          the SLP node with live lane zero the other live lanes also
6891          need to be identified as part of a reduction to be able
6892          to skip code generation for them.  */
6893       if (slp_for_stmt_info)
6894         {
6895           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6896             if (STMT_VINFO_LIVE_P (s))
6897               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6898         }
6899       else if (STMT_VINFO_LIVE_P (vdef))
6900         STMT_VINFO_REDUC_DEF (def) = phi_info;
6901       gimple_match_op op;
6902       if (!gimple_extract_op (vdef->stmt, &op))
6903         {
6904           if (dump_enabled_p ())
6905             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906                              "reduction chain includes unsupported"
6907                              " statement type.\n");
6908           return false;
6909         }
6910       if (CONVERT_EXPR_CODE_P (op.code))
6911         {
6912           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6913             {
6914               if (dump_enabled_p ())
6915                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6916                                  "conversion in the reduction chain.\n");
6917               return false;
6918             }
6919         }
6920       else if (!stmt_info)
6921         /* First non-conversion stmt.  */
6922         stmt_info = vdef;
6923       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6924       reduc_chain_length++;
6925       if (!stmt_info && slp_node)
6926         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6927     }
6928   /* PHIs should not participate in patterns.  */
6929   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6930
6931   if (nested_in_vect_loop_p (loop, stmt_info))
6932     {
6933       loop = loop->inner;
6934       nested_cycle = true;
6935     }
6936
6937   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6938      element.  */
6939   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6940     {
6941       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6942       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6943     }
6944   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6945     gcc_assert (slp_node
6946                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6947
6948   /* 1. Is vectorizable reduction?  */
6949   /* Not supportable if the reduction variable is used in the loop, unless
6950      it's a reduction chain.  */
6951   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6952       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6953     return false;
6954
6955   /* Reductions that are not used even in an enclosing outer-loop,
6956      are expected to be "live" (used out of the loop).  */
6957   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6958       && !STMT_VINFO_LIVE_P (stmt_info))
6959     return false;
6960
6961   /* 2. Has this been recognized as a reduction pattern?
6962
6963      Check if STMT represents a pattern that has been recognized
6964      in earlier analysis stages.  For stmts that represent a pattern,
6965      the STMT_VINFO_RELATED_STMT field records the last stmt in
6966      the original sequence that constitutes the pattern.  */
6967
6968   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6969   if (orig_stmt_info)
6970     {
6971       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6972       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6973     }
6974
6975   /* 3. Check the operands of the operation.  The first operands are defined
6976         inside the loop body. The last operand is the reduction variable,
6977         which is defined by the loop-header-phi.  */
6978
6979   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6980   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6981   gimple_match_op op;
6982   if (!gimple_extract_op (stmt_info->stmt, &op))
6983     gcc_unreachable ();
6984   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6985                             || op.code == WIDEN_SUM_EXPR
6986                             || op.code == SAD_EXPR);
6987
6988   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6989       && !SCALAR_FLOAT_TYPE_P (op.type))
6990     return false;
6991
6992   /* Do not try to vectorize bit-precision reductions.  */
6993   if (!type_has_mode_precision_p (op.type))
6994     return false;
6995
6996   /* For lane-reducing ops we're reducing the number of reduction PHIs
6997      which means the only use of that may be in the lane-reducing operation.  */
6998   if (lane_reduc_code_p
6999       && reduc_chain_length != 1
7000       && !only_slp_reduc_chain)
7001     {
7002       if (dump_enabled_p ())
7003         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7004                          "lane-reducing reduction with extra stmts.\n");
7005       return false;
7006     }
7007
7008   /* All uses but the last are expected to be defined in the loop.
7009      The last use is the reduction variable.  In case of nested cycle this
7010      assumption is not true: we use reduc_index to record the index of the
7011      reduction variable.  */
7012   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7013   /* We need to skip an extra operand for COND_EXPRs with embedded
7014      comparison.  */
7015   unsigned opno_adjust = 0;
7016   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7017     opno_adjust = 1;
7018   for (i = 0; i < (int) op.num_ops; i++)
7019     {
7020       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7021       if (i == 0 && op.code == COND_EXPR)
7022         continue;
7023
7024       stmt_vec_info def_stmt_info;
7025       enum vect_def_type dt;
7026       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7027                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7028                                &tem, &def_stmt_info))
7029         {
7030           if (dump_enabled_p ())
7031             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7032                              "use not simple.\n");
7033           return false;
7034         }
7035       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7036         continue;
7037
7038       /* There should be only one cycle def in the stmt, the one
7039          leading to reduc_def.  */
7040       if (VECTORIZABLE_CYCLE_DEF (dt))
7041         return false;
7042
7043       /* To properly compute ncopies we are interested in the widest
7044          non-reduction input type in case we're looking at a widening
7045          accumulation that we later handle in vect_transform_reduction.  */
7046       if (lane_reduc_code_p
7047           && tem
7048           && (!vectype_in
7049               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7050                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
7051         vectype_in = tem;
7052
7053       if (op.code == COND_EXPR)
7054         {
7055           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7056           if (dt == vect_constant_def)
7057             {
7058               cond_reduc_dt = dt;
7059               cond_reduc_val = op.ops[i];
7060             }
7061           if (dt == vect_induction_def
7062               && def_stmt_info
7063               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7064             {
7065               cond_reduc_dt = dt;
7066               cond_stmt_vinfo = def_stmt_info;
7067             }
7068         }
7069     }
7070   if (!vectype_in)
7071     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7072   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7073
7074   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7075   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7076   /* If we have a condition reduction, see if we can simplify it further.  */
7077   if (v_reduc_type == COND_REDUCTION)
7078     {
7079       if (slp_node)
7080         return false;
7081
7082       /* When the condition uses the reduction value in the condition, fail.  */
7083       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7084         {
7085           if (dump_enabled_p ())
7086             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7087                              "condition depends on previous iteration\n");
7088           return false;
7089         }
7090
7091       if (reduc_chain_length == 1
7092           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7093                                              vectype_in, OPTIMIZE_FOR_SPEED))
7094         {
7095           if (dump_enabled_p ())
7096             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097                              "optimizing condition reduction with"
7098                              " FOLD_EXTRACT_LAST.\n");
7099           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7100         }
7101       else if (cond_reduc_dt == vect_induction_def)
7102         {
7103           tree base
7104             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7105           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7106
7107           gcc_assert (TREE_CODE (base) == INTEGER_CST
7108                       && TREE_CODE (step) == INTEGER_CST);
7109           cond_reduc_val = NULL_TREE;
7110           enum tree_code cond_reduc_op_code = ERROR_MARK;
7111           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7112           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7113             ;
7114           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7115              above base; punt if base is the minimum value of the type for
7116              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7117           else if (tree_int_cst_sgn (step) == -1)
7118             {
7119               cond_reduc_op_code = MIN_EXPR;
7120               if (tree_int_cst_sgn (base) == -1)
7121                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7122               else if (tree_int_cst_lt (base,
7123                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7124                 cond_reduc_val
7125                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7126             }
7127           else
7128             {
7129               cond_reduc_op_code = MAX_EXPR;
7130               if (tree_int_cst_sgn (base) == 1)
7131                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7132               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7133                                         base))
7134                 cond_reduc_val
7135                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7136             }
7137           if (cond_reduc_val)
7138             {
7139               if (dump_enabled_p ())
7140                 dump_printf_loc (MSG_NOTE, vect_location,
7141                                  "condition expression based on "
7142                                  "integer induction.\n");
7143               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7144               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7145                 = cond_reduc_val;
7146               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7147             }
7148         }
7149       else if (cond_reduc_dt == vect_constant_def)
7150         {
7151           enum vect_def_type cond_initial_dt;
7152           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7153           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7154           if (cond_initial_dt == vect_constant_def
7155               && types_compatible_p (TREE_TYPE (cond_initial_val),
7156                                      TREE_TYPE (cond_reduc_val)))
7157             {
7158               tree e = fold_binary (LE_EXPR, boolean_type_node,
7159                                     cond_initial_val, cond_reduc_val);
7160               if (e && (integer_onep (e) || integer_zerop (e)))
7161                 {
7162                   if (dump_enabled_p ())
7163                     dump_printf_loc (MSG_NOTE, vect_location,
7164                                      "condition expression based on "
7165                                      "compile time constant.\n");
7166                   /* Record reduction code at analysis stage.  */
7167                   STMT_VINFO_REDUC_CODE (reduc_info)
7168                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7169                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7170                 }
7171             }
7172         }
7173     }
7174
7175   if (STMT_VINFO_LIVE_P (phi_info))
7176     return false;
7177
7178   if (slp_node)
7179     ncopies = 1;
7180   else
7181     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7182
7183   gcc_assert (ncopies >= 1);
7184
7185   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7186
7187   if (nested_cycle)
7188     {
7189       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7190                   == vect_double_reduction_def);
7191       double_reduc = true;
7192     }
7193
7194   /* 4.2. Check support for the epilog operation.
7195
7196           If STMT represents a reduction pattern, then the type of the
7197           reduction variable may be different than the type of the rest
7198           of the arguments.  For example, consider the case of accumulation
7199           of shorts into an int accumulator; The original code:
7200                         S1: int_a = (int) short_a;
7201           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7202
7203           was replaced with:
7204                         STMT: int_acc = widen_sum <short_a, int_acc>
7205
7206           This means that:
7207           1. The tree-code that is used to create the vector operation in the
7208              epilog code (that reduces the partial results) is not the
7209              tree-code of STMT, but is rather the tree-code of the original
7210              stmt from the pattern that STMT is replacing.  I.e, in the example
7211              above we want to use 'widen_sum' in the loop, but 'plus' in the
7212              epilog.
7213           2. The type (mode) we use to check available target support
7214              for the vector operation to be created in the *epilog*, is
7215              determined by the type of the reduction variable (in the example
7216              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7217              However the type (mode) we use to check available target support
7218              for the vector operation to be created *inside the loop*, is
7219              determined by the type of the other arguments to STMT (in the
7220              example we'd check this: optab_handler (widen_sum_optab,
7221              vect_short_mode)).
7222
7223           This is contrary to "regular" reductions, in which the types of all
7224           the arguments are the same as the type of the reduction variable.
7225           For "regular" reductions we can therefore use the same vector type
7226           (and also the same tree-code) when generating the epilog code and
7227           when generating the code inside the loop.  */
7228
7229   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7230   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7231
7232   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7233   if (reduction_type == TREE_CODE_REDUCTION)
7234     {
7235       /* Check whether it's ok to change the order of the computation.
7236          Generally, when vectorizing a reduction we change the order of the
7237          computation.  This may change the behavior of the program in some
7238          cases, so we need to check that this is ok.  One exception is when
7239          vectorizing an outer-loop: the inner-loop is executed sequentially,
7240          and therefore vectorizing reductions in the inner-loop during
7241          outer-loop vectorization is safe.  Likewise when we are vectorizing
7242          a series of reductions using SLP and the VF is one the reductions
7243          are performed in scalar order.  */
7244       if (slp_node
7245           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7246           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7247         ;
7248       else if (needs_fold_left_reduction_p (op.type, orig_code))
7249         {
7250           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7251              is not directy used in stmt.  */
7252           if (!only_slp_reduc_chain
7253               && reduc_chain_length != 1)
7254             {
7255               if (dump_enabled_p ())
7256                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7257                                  "in-order reduction chain without SLP.\n");
7258               return false;
7259             }
7260           STMT_VINFO_REDUC_TYPE (reduc_info)
7261             = reduction_type = FOLD_LEFT_REDUCTION;
7262         }
7263       else if (!commutative_binary_op_p (orig_code, op.type)
7264                || !associative_binary_op_p (orig_code, op.type))
7265         {
7266           if (dump_enabled_p ())
7267             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7268                             "reduction: not commutative/associative");
7269           return false;
7270         }
7271     }
7272
7273   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7274       && ncopies > 1)
7275     {
7276       if (dump_enabled_p ())
7277         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7278                          "multiple types in double reduction or condition "
7279                          "reduction or fold-left reduction.\n");
7280       return false;
7281     }
7282
7283   internal_fn reduc_fn = IFN_LAST;
7284   if (reduction_type == TREE_CODE_REDUCTION
7285       || reduction_type == FOLD_LEFT_REDUCTION
7286       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7287       || reduction_type == CONST_COND_REDUCTION)
7288     {
7289       if (reduction_type == FOLD_LEFT_REDUCTION
7290           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7291           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7292         {
7293           if (reduc_fn != IFN_LAST
7294               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7295                                                   OPTIMIZE_FOR_SPEED))
7296             {
7297               if (dump_enabled_p ())
7298                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299                                  "reduc op not supported by target.\n");
7300
7301               reduc_fn = IFN_LAST;
7302             }
7303         }
7304       else
7305         {
7306           if (!nested_cycle || double_reduc)
7307             {
7308               if (dump_enabled_p ())
7309                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7310                                  "no reduc code for scalar code.\n");
7311
7312               return false;
7313             }
7314         }
7315     }
7316   else if (reduction_type == COND_REDUCTION)
7317     {
7318       int scalar_precision
7319         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7320       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7321       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7322                                                 vectype_out);
7323
7324       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7325                                           OPTIMIZE_FOR_SPEED))
7326         reduc_fn = IFN_REDUC_MAX;
7327     }
7328   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7329
7330   if (reduction_type != EXTRACT_LAST_REDUCTION
7331       && (!nested_cycle || double_reduc)
7332       && reduc_fn == IFN_LAST
7333       && !nunits_out.is_constant ())
7334     {
7335       if (dump_enabled_p ())
7336         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7337                          "missing target support for reduction on"
7338                          " variable-length vectors.\n");
7339       return false;
7340     }
7341
7342   /* For SLP reductions, see if there is a neutral value we can use.  */
7343   tree neutral_op = NULL_TREE;
7344   if (slp_node)
7345     {
7346       tree initial_value = NULL_TREE;
7347       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7348         initial_value = vect_phi_initial_value (reduc_def_phi);
7349       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7350                                              orig_code, initial_value);
7351     }
7352
7353   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7354     {
7355       /* We can't support in-order reductions of code such as this:
7356
7357            for (int i = 0; i < n1; ++i)
7358              for (int j = 0; j < n2; ++j)
7359                l += a[j];
7360
7361          since GCC effectively transforms the loop when vectorizing:
7362
7363            for (int i = 0; i < n1 / VF; ++i)
7364              for (int j = 0; j < n2; ++j)
7365                for (int k = 0; k < VF; ++k)
7366                  l += a[j];
7367
7368          which is a reassociation of the original operation.  */
7369       if (dump_enabled_p ())
7370         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371                          "in-order double reduction not supported.\n");
7372
7373       return false;
7374     }
7375
7376   if (reduction_type == FOLD_LEFT_REDUCTION
7377       && slp_node
7378       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7379     {
7380       /* We cannot use in-order reductions in this case because there is
7381          an implicit reassociation of the operations involved.  */
7382       if (dump_enabled_p ())
7383         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7384                          "in-order unchained SLP reductions not supported.\n");
7385       return false;
7386     }
7387
7388   /* For double reductions, and for SLP reductions with a neutral value,
7389      we construct a variable-length initial vector by loading a vector
7390      full of the neutral value and then shift-and-inserting the start
7391      values into the low-numbered elements.  */
7392   if ((double_reduc || neutral_op)
7393       && !nunits_out.is_constant ()
7394       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7395                                           vectype_out, OPTIMIZE_FOR_SPEED))
7396     {
7397       if (dump_enabled_p ())
7398         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399                          "reduction on variable-length vectors requires"
7400                          " target support for a vector-shift-and-insert"
7401                          " operation.\n");
7402       return false;
7403     }
7404
7405   /* Check extra constraints for variable-length unchained SLP reductions.  */
7406   if (STMT_SLP_TYPE (stmt_info)
7407       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7408       && !nunits_out.is_constant ())
7409     {
7410       /* We checked above that we could build the initial vector when
7411          there's a neutral element value.  Check here for the case in
7412          which each SLP statement has its own initial value and in which
7413          that value needs to be repeated for every instance of the
7414          statement within the initial vector.  */
7415       unsigned int group_size = SLP_TREE_LANES (slp_node);
7416       if (!neutral_op
7417           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7418                                               TREE_TYPE (vectype_out)))
7419         {
7420           if (dump_enabled_p ())
7421             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7422                              "unsupported form of SLP reduction for"
7423                              " variable-length vectors: cannot build"
7424                              " initial vector.\n");
7425           return false;
7426         }
7427       /* The epilogue code relies on the number of elements being a multiple
7428          of the group size.  The duplicate-and-interleave approach to setting
7429          up the initial vector does too.  */
7430       if (!multiple_p (nunits_out, group_size))
7431         {
7432           if (dump_enabled_p ())
7433             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7434                              "unsupported form of SLP reduction for"
7435                              " variable-length vectors: the vector size"
7436                              " is not a multiple of the number of results.\n");
7437           return false;
7438         }
7439     }
7440
7441   if (reduction_type == COND_REDUCTION)
7442     {
7443       widest_int ni;
7444
7445       if (! max_loop_iterations (loop, &ni))
7446         {
7447           if (dump_enabled_p ())
7448             dump_printf_loc (MSG_NOTE, vect_location,
7449                              "loop count not known, cannot create cond "
7450                              "reduction.\n");
7451           return false;
7452         }
7453       /* Convert backedges to iterations.  */
7454       ni += 1;
7455
7456       /* The additional index will be the same type as the condition.  Check
7457          that the loop can fit into this less one (because we'll use up the
7458          zero slot for when there are no matches).  */
7459       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7460       if (wi::geu_p (ni, wi::to_widest (max_index)))
7461         {
7462           if (dump_enabled_p ())
7463             dump_printf_loc (MSG_NOTE, vect_location,
7464                              "loop size is greater than data size.\n");
7465           return false;
7466         }
7467     }
7468
7469   /* In case the vectorization factor (VF) is bigger than the number
7470      of elements that we can fit in a vectype (nunits), we have to generate
7471      more than one vector stmt - i.e - we need to "unroll" the
7472      vector stmt by a factor VF/nunits.  For more details see documentation
7473      in vectorizable_operation.  */
7474
7475   /* If the reduction is used in an outer loop we need to generate
7476      VF intermediate results, like so (e.g. for ncopies=2):
7477         r0 = phi (init, r0)
7478         r1 = phi (init, r1)
7479         r0 = x0 + r0;
7480         r1 = x1 + r1;
7481     (i.e. we generate VF results in 2 registers).
7482     In this case we have a separate def-use cycle for each copy, and therefore
7483     for each copy we get the vector def for the reduction variable from the
7484     respective phi node created for this copy.
7485
7486     Otherwise (the reduction is unused in the loop nest), we can combine
7487     together intermediate results, like so (e.g. for ncopies=2):
7488         r = phi (init, r)
7489         r = x0 + r;
7490         r = x1 + r;
7491    (i.e. we generate VF/2 results in a single register).
7492    In this case for each copy we get the vector def for the reduction variable
7493    from the vectorized reduction operation generated in the previous iteration.
7494
7495    This only works when we see both the reduction PHI and its only consumer
7496    in vectorizable_reduction and there are no intermediate stmts
7497    participating.  When unrolling we want each unrolled iteration to have its
7498    own reduction accumulator since one of the main goals of unrolling a
7499    reduction is to reduce the aggregate loop-carried latency.  */
7500   if (ncopies > 1
7501       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7502       && reduc_chain_length == 1
7503       && loop_vinfo->suggested_unroll_factor == 1)
7504     single_defuse_cycle = true;
7505
7506   if (single_defuse_cycle || lane_reduc_code_p)
7507     {
7508       gcc_assert (op.code != COND_EXPR);
7509
7510       /* 4. Supportable by target?  */
7511       bool ok = true;
7512
7513       /* 4.1. check support for the operation in the loop
7514
7515          This isn't necessary for the lane reduction codes, since they
7516          can only be produced by pattern matching, and it's up to the
7517          pattern matcher to test for support.  The main reason for
7518          specifically skipping this step is to avoid rechecking whether
7519          mixed-sign dot-products can be implemented using signed
7520          dot-products.  */
7521       machine_mode vec_mode = TYPE_MODE (vectype_in);
7522       if (!lane_reduc_code_p
7523           && !directly_supported_p (op.code, vectype_in, optab_vector))
7524         {
7525           if (dump_enabled_p ())
7526             dump_printf (MSG_NOTE, "op not supported by target.\n");
7527           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7528               || !vect_can_vectorize_without_simd_p (op.code))
7529             ok = false;
7530           else
7531             if (dump_enabled_p ())
7532               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7533         }
7534
7535       if (vect_emulated_vector_p (vectype_in)
7536           && !vect_can_vectorize_without_simd_p (op.code))
7537         {
7538           if (dump_enabled_p ())
7539             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7540           return false;
7541         }
7542
7543       /* lane-reducing operations have to go through vect_transform_reduction.
7544          For the other cases try without the single cycle optimization.  */
7545       if (!ok)
7546         {
7547           if (lane_reduc_code_p)
7548             return false;
7549           else
7550             single_defuse_cycle = false;
7551         }
7552     }
7553   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7554
7555   /* If the reduction stmt is one of the patterns that have lane
7556      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7557   if ((ncopies > 1 && ! single_defuse_cycle)
7558       && lane_reduc_code_p)
7559     {
7560       if (dump_enabled_p ())
7561         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7562                          "multi def-use cycle not possible for lane-reducing "
7563                          "reduction operation\n");
7564       return false;
7565     }
7566
7567   if (slp_node
7568       && !(!single_defuse_cycle
7569            && !lane_reduc_code_p
7570            && reduction_type != FOLD_LEFT_REDUCTION))
7571     for (i = 0; i < (int) op.num_ops; i++)
7572       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7573         {
7574           if (dump_enabled_p ())
7575             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576                              "incompatible vector types for invariants\n");
7577           return false;
7578         }
7579
7580   if (slp_node)
7581     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7582   else
7583     vec_num = 1;
7584
7585   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7586                              reduction_type, ncopies, cost_vec);
7587   /* Cost the reduction op inside the loop if transformed via
7588      vect_transform_reduction.  Otherwise this is costed by the
7589      separate vectorizable_* routines.  */
7590   if (single_defuse_cycle || lane_reduc_code_p)
7591     {
7592       int factor = 1;
7593       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7594         /* Three dot-products and a subtraction.  */
7595         factor = 4;
7596       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7597                         stmt_info, 0, vect_body);
7598     }
7599
7600   if (dump_enabled_p ()
7601       && reduction_type == FOLD_LEFT_REDUCTION)
7602     dump_printf_loc (MSG_NOTE, vect_location,
7603                      "using an in-order (fold-left) reduction.\n");
7604   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7605   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7606      reductions go through their own vectorizable_* routines.  */
7607   if (!single_defuse_cycle
7608       && !lane_reduc_code_p
7609       && reduction_type != FOLD_LEFT_REDUCTION)
7610     {
7611       stmt_vec_info tem
7612         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7613       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7614         {
7615           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7616           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7617         }
7618       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7619       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7620     }
7621   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7622     {
7623       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7624       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7625
7626       if (reduction_type != FOLD_LEFT_REDUCTION
7627           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7628           && (cond_fn == IFN_LAST
7629               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7630                                                   OPTIMIZE_FOR_SPEED)))
7631         {
7632           if (dump_enabled_p ())
7633             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7634                              "can't operate on partial vectors because"
7635                              " no conditional operation is available.\n");
7636           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7637         }
7638       else if (reduction_type == FOLD_LEFT_REDUCTION
7639                && reduc_fn == IFN_LAST
7640                && !expand_vec_cond_expr_p (vectype_in,
7641                                            truth_type_for (vectype_in),
7642                                            SSA_NAME))
7643         {
7644           if (dump_enabled_p ())
7645             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646                              "can't operate on partial vectors because"
7647                              " no conditional operation is available.\n");
7648           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7649         }
7650       else
7651         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7652                                vectype_in, NULL);
7653     }
7654   return true;
7655 }
7656
7657 /* STMT_INFO is a dot-product reduction whose multiplication operands
7658    have different signs.  Emit a sequence to emulate the operation
7659    using a series of signed DOT_PROD_EXPRs and return the last
7660    statement generated.  VEC_DEST is the result of the vector operation
7661    and VOP lists its inputs.  */
7662
7663 static gassign *
7664 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7665                              gimple_stmt_iterator *gsi, tree vec_dest,
7666                              tree vop[3])
7667 {
7668   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7669   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7670   tree narrow_elttype = TREE_TYPE (narrow_vectype);
7671   gimple *new_stmt;
7672
7673   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
7674   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7675     std::swap (vop[0], vop[1]);
7676
7677   /* Convert all inputs to signed types.  */
7678   for (int i = 0; i < 3; ++i)
7679     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7680       {
7681         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7682         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7683         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7684         vop[i] = tmp;
7685       }
7686
7687   /* In the comments below we assume 8-bit inputs for simplicity,
7688      but the approach works for any full integer type.  */
7689
7690   /* Create a vector of -128.  */
7691   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7692   tree min_narrow = build_vector_from_val (narrow_vectype,
7693                                            min_narrow_elttype);
7694
7695   /* Create a vector of 64.  */
7696   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7697   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7698   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7699
7700   /* Emit: SUB_RES = VOP[0] - 128.  */
7701   tree sub_res = make_ssa_name (narrow_vectype);
7702   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7703   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7704
7705   /* Emit:
7706
7707        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7708        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7709        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7710
7711      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7712      Doing the two 64 * y steps first allows more time to compute x.  */
7713   tree stage1 = make_ssa_name (wide_vectype);
7714   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7715                                   vop[1], half_narrow, vop[2]);
7716   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7717
7718   tree stage2 = make_ssa_name (wide_vectype);
7719   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7720                                   vop[1], half_narrow, stage1);
7721   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7722
7723   tree stage3 = make_ssa_name (wide_vectype);
7724   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7725                                   sub_res, vop[1], stage2);
7726   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7727
7728   /* Convert STAGE3 to the reduction type.  */
7729   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7730 }
7731
7732 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7733    value.  */
7734
7735 bool
7736 vect_transform_reduction (loop_vec_info loop_vinfo,
7737                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7738                           gimple **vec_stmt, slp_tree slp_node)
7739 {
7740   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7741   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7742   int i;
7743   int ncopies;
7744   int vec_num;
7745
7746   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7747   gcc_assert (reduc_info->is_reduc_info);
7748
7749   if (nested_in_vect_loop_p (loop, stmt_info))
7750     {
7751       loop = loop->inner;
7752       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7753     }
7754
7755   gimple_match_op op;
7756   if (!gimple_extract_op (stmt_info->stmt, &op))
7757     gcc_unreachable ();
7758   gcc_assert (op.code.is_tree_code ());
7759   auto code = tree_code (op.code);
7760
7761   /* All uses but the last are expected to be defined in the loop.
7762      The last use is the reduction variable.  In case of nested cycle this
7763      assumption is not true: we use reduc_index to record the index of the
7764      reduction variable.  */
7765   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7766   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7767   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7768   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7769
7770   if (slp_node)
7771     {
7772       ncopies = 1;
7773       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7774     }
7775   else
7776     {
7777       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7778       vec_num = 1;
7779     }
7780
7781   internal_fn cond_fn = get_conditional_internal_fn (code);
7782   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7783   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7784
7785   /* Transform.  */
7786   tree new_temp = NULL_TREE;
7787   auto_vec<tree> vec_oprnds0;
7788   auto_vec<tree> vec_oprnds1;
7789   auto_vec<tree> vec_oprnds2;
7790   tree def0;
7791
7792   if (dump_enabled_p ())
7793     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7794
7795   /* FORNOW: Multiple types are not supported for condition.  */
7796   if (code == COND_EXPR)
7797     gcc_assert (ncopies == 1);
7798
7799   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7800
7801   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7802   if (reduction_type == FOLD_LEFT_REDUCTION)
7803     {
7804       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7805       return vectorize_fold_left_reduction
7806           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7807            reduc_fn, op.ops, vectype_in, reduc_index, masks);
7808     }
7809
7810   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7811   gcc_assert (single_defuse_cycle
7812               || code == DOT_PROD_EXPR
7813               || code == WIDEN_SUM_EXPR
7814               || code == SAD_EXPR);
7815
7816   /* Create the destination vector  */
7817   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7818   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7819
7820   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7821                      single_defuse_cycle && reduc_index == 0
7822                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
7823                      single_defuse_cycle && reduc_index == 1
7824                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
7825                      op.num_ops == 3
7826                      && !(single_defuse_cycle && reduc_index == 2)
7827                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7828   if (single_defuse_cycle)
7829     {
7830       gcc_assert (!slp_node);
7831       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7832                                      op.ops[reduc_index],
7833                                      reduc_index == 0 ? &vec_oprnds0
7834                                      : (reduc_index == 1 ? &vec_oprnds1
7835                                         : &vec_oprnds2));
7836     }
7837
7838   bool emulated_mixed_dot_prod
7839     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
7840   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7841     {
7842       gimple *new_stmt;
7843       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7844       if (masked_loop_p && !mask_by_cond_expr)
7845         {
7846           /* No conditional ifns have been defined for dot-product yet.  */
7847           gcc_assert (code != DOT_PROD_EXPR);
7848
7849           /* Make sure that the reduction accumulator is vop[0].  */
7850           if (reduc_index == 1)
7851             {
7852               gcc_assert (commutative_tree_code (code));
7853               std::swap (vop[0], vop[1]);
7854             }
7855           tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7856                                           vectype_in, i);
7857           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7858                                                     vop[0], vop[1], vop[0]);
7859           new_temp = make_ssa_name (vec_dest, call);
7860           gimple_call_set_lhs (call, new_temp);
7861           gimple_call_set_nothrow (call, true);
7862           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7863           new_stmt = call;
7864         }
7865       else
7866         {
7867           if (op.num_ops == 3)
7868             vop[2] = vec_oprnds2[i];
7869
7870           if (masked_loop_p && mask_by_cond_expr)
7871             {
7872               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7873                                               vectype_in, i);
7874               build_vect_cond_expr (code, vop, mask, gsi);
7875             }
7876
7877           if (emulated_mixed_dot_prod)
7878             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7879                                                     vec_dest, vop);
7880           else
7881             new_stmt = gimple_build_assign (vec_dest, code,
7882                                             vop[0], vop[1], vop[2]);
7883           new_temp = make_ssa_name (vec_dest, new_stmt);
7884           gimple_assign_set_lhs (new_stmt, new_temp);
7885           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7886         }
7887
7888       if (slp_node)
7889         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7890       else if (single_defuse_cycle
7891                && i < ncopies - 1)
7892         {
7893           if (reduc_index == 0)
7894             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7895           else if (reduc_index == 1)
7896             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7897           else if (reduc_index == 2)
7898             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7899         }
7900       else
7901         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7902     }
7903
7904   if (!slp_node)
7905     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7906
7907   return true;
7908 }
7909
7910 /* Transform phase of a cycle PHI.  */
7911
7912 bool
7913 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7914                           stmt_vec_info stmt_info, gimple **vec_stmt,
7915                           slp_tree slp_node, slp_instance slp_node_instance)
7916 {
7917   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7918   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7919   int i;
7920   int ncopies;
7921   int j;
7922   bool nested_cycle = false;
7923   int vec_num;
7924
7925   if (nested_in_vect_loop_p (loop, stmt_info))
7926     {
7927       loop = loop->inner;
7928       nested_cycle = true;
7929     }
7930
7931   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7932   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7933   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7934   gcc_assert (reduc_info->is_reduc_info);
7935
7936   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7937       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7938     /* Leave the scalar phi in place.  */
7939     return true;
7940
7941   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7942   /* For a nested cycle we do not fill the above.  */
7943   if (!vectype_in)
7944     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7945   gcc_assert (vectype_in);
7946
7947   if (slp_node)
7948     {
7949       /* The size vect_schedule_slp_instance computes is off for us.  */
7950       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7951                                       * SLP_TREE_LANES (slp_node), vectype_in);
7952       ncopies = 1;
7953     }
7954   else
7955     {
7956       vec_num = 1;
7957       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7958     }
7959
7960   /* Check whether we should use a single PHI node and accumulate
7961      vectors to one before the backedge.  */
7962   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7963     ncopies = 1;
7964
7965   /* Create the destination vector  */
7966   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7967   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7968                                                vectype_out);
7969
7970   /* Get the loop-entry arguments.  */
7971   tree vec_initial_def = NULL_TREE;
7972   auto_vec<tree> vec_initial_defs;
7973   if (slp_node)
7974     {
7975       vec_initial_defs.reserve (vec_num);
7976       if (nested_cycle)
7977         {
7978           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7979           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7980                              &vec_initial_defs);
7981         }
7982       else
7983         {
7984           gcc_assert (slp_node == slp_node_instance->reduc_phis);
7985           vec<tree> &initial_values = reduc_info->reduc_initial_values;
7986           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7987
7988           unsigned int num_phis = stmts.length ();
7989           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7990             num_phis = 1;
7991           initial_values.reserve (num_phis);
7992           for (unsigned int i = 0; i < num_phis; ++i)
7993             {
7994               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7995               initial_values.quick_push (vect_phi_initial_value (this_phi));
7996             }
7997           if (vec_num == 1)
7998             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7999           if (!initial_values.is_empty ())
8000             {
8001               tree initial_value
8002                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8003               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8004               tree neutral_op
8005                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8006                                             code, initial_value);
8007               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8008                                               &vec_initial_defs, vec_num,
8009                                               stmts.length (), neutral_op);
8010             }
8011         }
8012     }
8013   else
8014     {
8015       /* Get at the scalar def before the loop, that defines the initial
8016          value of the reduction variable.  */
8017       tree initial_def = vect_phi_initial_value (phi);
8018       reduc_info->reduc_initial_values.safe_push (initial_def);
8019       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8020          and we can't use zero for induc_val, use initial_def.  Similarly
8021          for REDUC_MIN and initial_def larger than the base.  */
8022       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8023         {
8024           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8025           if (TREE_CODE (initial_def) == INTEGER_CST
8026               && !integer_zerop (induc_val)
8027               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8028                    && tree_int_cst_lt (initial_def, induc_val))
8029                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8030                       && tree_int_cst_lt (induc_val, initial_def))))
8031             {
8032               induc_val = initial_def;
8033               /* Communicate we used the initial_def to epilouge
8034                  generation.  */
8035               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8036             }
8037           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8038         }
8039       else if (nested_cycle)
8040         {
8041           /* Do not use an adjustment def as that case is not supported
8042              correctly if ncopies is not one.  */
8043           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8044                                          ncopies, initial_def,
8045                                          &vec_initial_defs);
8046         }
8047       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8048                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8049         /* Fill the initial vector with the initial scalar value.  */
8050         vec_initial_def
8051           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8052                                            initial_def, initial_def);
8053       else
8054         {
8055           if (ncopies == 1)
8056             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8057           if (!reduc_info->reduc_initial_values.is_empty ())
8058             {
8059               initial_def = reduc_info->reduc_initial_values[0];
8060               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8061               tree neutral_op
8062                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8063                                             code, initial_def);
8064               gcc_assert (neutral_op);
8065               /* Try to simplify the vector initialization by applying an
8066                  adjustment after the reduction has been performed.  */
8067               if (!reduc_info->reused_accumulator
8068                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8069                   && !operand_equal_p (neutral_op, initial_def))
8070                 {
8071                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8072                     = initial_def;
8073                   initial_def = neutral_op;
8074                 }
8075               vec_initial_def
8076                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8077                                                  initial_def, neutral_op);
8078             }
8079         }
8080     }
8081
8082   if (vec_initial_def)
8083     {
8084       vec_initial_defs.create (ncopies);
8085       for (i = 0; i < ncopies; ++i)
8086         vec_initial_defs.quick_push (vec_initial_def);
8087     }
8088
8089   if (auto *accumulator = reduc_info->reused_accumulator)
8090     {
8091       tree def = accumulator->reduc_input;
8092       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8093         {
8094           unsigned int nreduc;
8095           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8096                                             (TREE_TYPE (def)),
8097                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8098                                           &nreduc);
8099           gcc_assert (res);
8100           gimple_seq stmts = NULL;
8101           /* Reduce the single vector to a smaller one.  */
8102           if (nreduc != 1)
8103             {
8104               /* Perform the reduction in the appropriate type.  */
8105               tree rvectype = vectype_out;
8106               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8107                                               TREE_TYPE (TREE_TYPE (def))))
8108                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8109                                               TYPE_VECTOR_SUBPARTS
8110                                                 (vectype_out));
8111               def = vect_create_partial_epilog (def, rvectype,
8112                                                 STMT_VINFO_REDUC_CODE
8113                                                   (reduc_info),
8114                                                 &stmts);
8115             }
8116           /* The epilogue loop might use a different vector mode, like
8117              VNx2DI vs. V2DI.  */
8118           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8119             {
8120               tree reduc_type = build_vector_type_for_mode
8121                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8122               def = gimple_convert (&stmts, reduc_type, def);
8123             }
8124           /* Adjust the input so we pick up the partially reduced value
8125              for the skip edge in vect_create_epilog_for_reduction.  */
8126           accumulator->reduc_input = def;
8127           /* And the reduction could be carried out using a different sign.  */
8128           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8129             def = gimple_convert (&stmts, vectype_out, def);
8130           if (loop_vinfo->main_loop_edge)
8131             {
8132               /* While we'd like to insert on the edge this will split
8133                  blocks and disturb bookkeeping, we also will eventually
8134                  need this on the skip edge.  Rely on sinking to
8135                  fixup optimal placement and insert in the pred.  */
8136               gimple_stmt_iterator gsi
8137                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8138               /* Insert before a cond that eventually skips the
8139                  epilogue.  */
8140               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8141                 gsi_prev (&gsi);
8142               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8143             }
8144           else
8145             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8146                                               stmts);
8147         }
8148       if (loop_vinfo->main_loop_edge)
8149         vec_initial_defs[0]
8150           = vect_get_main_loop_result (loop_vinfo, def,
8151                                        vec_initial_defs[0]);
8152       else
8153         vec_initial_defs.safe_push (def);
8154     }
8155
8156   /* Generate the reduction PHIs upfront.  */
8157   for (i = 0; i < vec_num; i++)
8158     {
8159       tree vec_init_def = vec_initial_defs[i];
8160       for (j = 0; j < ncopies; j++)
8161         {
8162           /* Create the reduction-phi that defines the reduction
8163              operand.  */
8164           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8165
8166           /* Set the loop-entry arg of the reduction-phi.  */
8167           if (j != 0 && nested_cycle)
8168             vec_init_def = vec_initial_defs[j];
8169           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8170                        UNKNOWN_LOCATION);
8171
8172           /* The loop-latch arg is set in epilogue processing.  */
8173
8174           if (slp_node)
8175             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8176           else
8177             {
8178               if (j == 0)
8179                 *vec_stmt = new_phi;
8180               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8181             }
8182         }
8183     }
8184
8185   return true;
8186 }
8187
8188 /* Vectorizes LC PHIs.  */
8189
8190 bool
8191 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8192                      stmt_vec_info stmt_info, gimple **vec_stmt,
8193                      slp_tree slp_node)
8194 {
8195   if (!loop_vinfo
8196       || !is_a <gphi *> (stmt_info->stmt)
8197       || gimple_phi_num_args (stmt_info->stmt) != 1)
8198     return false;
8199
8200   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8201       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8202     return false;
8203
8204   if (!vec_stmt) /* transformation not required.  */
8205     {
8206       /* Deal with copies from externs or constants that disguise as
8207          loop-closed PHI nodes (PR97886).  */
8208       if (slp_node
8209           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8210                                                 SLP_TREE_VECTYPE (slp_node)))
8211         {
8212           if (dump_enabled_p ())
8213             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8214                              "incompatible vector types for invariants\n");
8215           return false;
8216         }
8217       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8218       return true;
8219     }
8220
8221   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8222   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8223   basic_block bb = gimple_bb (stmt_info->stmt);
8224   edge e = single_pred_edge (bb);
8225   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8226   auto_vec<tree> vec_oprnds;
8227   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8228                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8229                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8230   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8231     {
8232       /* Create the vectorized LC PHI node.  */
8233       gphi *new_phi = create_phi_node (vec_dest, bb);
8234       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8235       if (slp_node)
8236         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8237       else
8238         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8239     }
8240   if (!slp_node)
8241     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8242
8243   return true;
8244 }
8245
8246 /* Vectorizes PHIs.  */
8247
8248 bool
8249 vectorizable_phi (vec_info *,
8250                   stmt_vec_info stmt_info, gimple **vec_stmt,
8251                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8252 {
8253   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8254     return false;
8255
8256   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8257     return false;
8258
8259   tree vectype = SLP_TREE_VECTYPE (slp_node);
8260
8261   if (!vec_stmt) /* transformation not required.  */
8262     {
8263       slp_tree child;
8264       unsigned i;
8265       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8266         if (!child)
8267           {
8268             if (dump_enabled_p ())
8269               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8270                                "PHI node with unvectorized backedge def\n");
8271             return false;
8272           }
8273         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8274           {
8275             if (dump_enabled_p ())
8276               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8277                                "incompatible vector types for invariants\n");
8278             return false;
8279           }
8280         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8281                  && !useless_type_conversion_p (vectype,
8282                                                 SLP_TREE_VECTYPE (child)))
8283           {
8284             /* With bools we can have mask and non-mask precision vectors
8285                or different non-mask precisions.  while pattern recog is
8286                supposed to guarantee consistency here bugs in it can cause
8287                mismatches (PR103489 and PR103800 for example).
8288                Deal with them here instead of ICEing later.  */
8289             if (dump_enabled_p ())
8290               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8291                                "incompatible vector type setup from "
8292                                "bool pattern detection\n");
8293             return false;
8294           }
8295
8296       /* For single-argument PHIs assume coalescing which means zero cost
8297          for the scalar and the vector PHIs.  This avoids artificially
8298          favoring the vector path (but may pessimize it in some cases).  */
8299       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8300         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8301                           vector_stmt, stmt_info, vectype, 0, vect_body);
8302       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8303       return true;
8304     }
8305
8306   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8307   basic_block bb = gimple_bb (stmt_info->stmt);
8308   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8309   auto_vec<gphi *> new_phis;
8310   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8311     {
8312       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8313
8314       /* Skip not yet vectorized defs.  */
8315       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8316           && SLP_TREE_VEC_STMTS (child).is_empty ())
8317         continue;
8318
8319       auto_vec<tree> vec_oprnds;
8320       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8321       if (!new_phis.exists ())
8322         {
8323           new_phis.create (vec_oprnds.length ());
8324           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8325             {
8326               /* Create the vectorized LC PHI node.  */
8327               new_phis.quick_push (create_phi_node (vec_dest, bb));
8328               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8329             }
8330         }
8331       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8332       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8333         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8334     }
8335   /* We should have at least one already vectorized child.  */
8336   gcc_assert (new_phis.exists ());
8337
8338   return true;
8339 }
8340
8341 /* Vectorizes first order recurrences.  An overview of the transformation
8342    is described below. Suppose we have the following loop.
8343
8344      int t = 0;
8345      for (int i = 0; i < n; ++i)
8346        {
8347          b[i] = a[i] - t;
8348          t = a[i];
8349        }
8350
8351    There is a first-order recurrence on 'a'. For this loop, the scalar IR
8352    looks (simplified) like:
8353
8354     scalar.preheader:
8355       init = 0;
8356
8357     scalar.body:
8358       i = PHI <0(scalar.preheader), i+1(scalar.body)>
8359       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8360       _1 = a[i]
8361       b[i] = _1 - _2
8362       if (i < n) goto scalar.body
8363
8364    In this example, _2 is a recurrence because it's value depends on the
8365    previous iteration.  We vectorize this as (VF = 4)
8366
8367     vector.preheader:
8368       vect_init = vect_cst(..., ..., ..., 0)
8369
8370     vector.body
8371       i = PHI <0(vector.preheader), i+4(vector.body)>
8372       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8373       vect_2 = a[i, i+1, i+2, i+3];
8374       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8375       b[i, i+1, i+2, i+3] = vect_2 - vect_3
8376       if (..) goto vector.body
8377
8378    In this function, vectorizable_recurr, we code generate both the
8379    vector PHI node and the permute since those together compute the
8380    vectorized value of the scalar PHI.  We do not yet have the
8381    backedge value to fill in there nor into the vec_perm.  Those
8382    are filled in maybe_set_vectorized_backedge_value and
8383    vect_schedule_scc.
8384
8385    TODO:  Since the scalar loop does not have a use of the recurrence
8386    outside of the loop the natural way to implement peeling via
8387    vectorizing the live value doesn't work.  For now peeling of loops
8388    with a recurrence is not implemented.  For SLP the supported cases
8389    are restricted to those requiring a single vector recurrence PHI.  */
8390
8391 bool
8392 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8393                      gimple **vec_stmt, slp_tree slp_node,
8394                      stmt_vector_for_cost *cost_vec)
8395 {
8396   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8397     return false;
8398
8399   gphi *phi = as_a<gphi *> (stmt_info->stmt);
8400
8401   /* So far we only support first-order recurrence auto-vectorization.  */
8402   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8403     return false;
8404
8405   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8406   unsigned ncopies;
8407   if (slp_node)
8408     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8409   else
8410     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8411   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8412   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8413   /* We need to be able to make progress with a single vector.  */
8414   if (maybe_gt (dist * 2, nunits))
8415     {
8416       if (dump_enabled_p ())
8417         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8418                          "first order recurrence exceeds half of "
8419                          "a vector\n");
8420       return false;
8421     }
8422
8423   /* First-order recurrence autovectorization needs to handle permutation
8424      with indices = [nunits-1, nunits, nunits+1, ...].  */
8425   vec_perm_builder sel (nunits, 1, 3);
8426   for (int i = 0; i < 3; ++i)
8427     sel.quick_push (nunits - dist + i);
8428   vec_perm_indices indices (sel, 2, nunits);
8429
8430   if (!vec_stmt) /* transformation not required.  */
8431     {
8432       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8433                                  indices))
8434         return false;
8435
8436       if (slp_node)
8437         {
8438           /* We eventually need to set a vector type on invariant
8439              arguments.  */
8440           unsigned j;
8441           slp_tree child;
8442           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8443             if (!vect_maybe_update_slp_op_vectype
8444                   (child, SLP_TREE_VECTYPE (slp_node)))
8445               {
8446                 if (dump_enabled_p ())
8447                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8448                                    "incompatible vector types for "
8449                                    "invariants\n");
8450                 return false;
8451               }
8452         }
8453       /* The recurrence costs the initialization vector and one permute
8454          for each copy.  */
8455       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8456                                                  stmt_info, 0, vect_prologue);
8457       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8458                                                stmt_info, 0, vect_body);
8459       if (dump_enabled_p ())
8460         dump_printf_loc (MSG_NOTE, vect_location,
8461                          "vectorizable_recurr: inside_cost = %d, "
8462                          "prologue_cost = %d .\n", inside_cost,
8463                          prologue_cost);
8464
8465       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8466       return true;
8467     }
8468
8469   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8470   basic_block bb = gimple_bb (phi);
8471   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8472   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8473     {
8474       gimple_seq stmts = NULL;
8475       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8476       gsi_insert_seq_on_edge_immediate (pe, stmts);
8477     }
8478   tree vec_init = build_vector_from_val (vectype, preheader);
8479   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8480
8481   /* Create the vectorized first-order PHI node.  */
8482   tree vec_dest = vect_get_new_vect_var (vectype,
8483                                          vect_simple_var, "vec_recur_");
8484   gphi *new_phi = create_phi_node (vec_dest, bb);
8485   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8486
8487   /* Insert shuffles the first-order recurrence autovectorization.
8488        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
8489   tree perm = vect_gen_perm_mask_checked (vectype, indices);
8490
8491   /* Insert the required permute after the latch definition.  The
8492      second and later operands are tentative and will be updated when we have
8493      vectorized the latch definition.  */
8494   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8495   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8496   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8497   gsi_next (&gsi2);
8498
8499   for (unsigned i = 0; i < ncopies; ++i)
8500     {
8501       vec_dest = make_ssa_name (vectype);
8502       gassign *vperm
8503           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8504                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
8505                                  NULL, perm);
8506       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8507
8508       if (slp_node)
8509         SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8510       else
8511         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8512     }
8513
8514   if (!slp_node)
8515     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8516   return true;
8517 }
8518
8519 /* Return true if VECTYPE represents a vector that requires lowering
8520    by the vector lowering pass.  */
8521
8522 bool
8523 vect_emulated_vector_p (tree vectype)
8524 {
8525   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8526           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8527               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8528 }
8529
8530 /* Return true if we can emulate CODE on an integer mode representation
8531    of a vector.  */
8532
8533 bool
8534 vect_can_vectorize_without_simd_p (tree_code code)
8535 {
8536   switch (code)
8537     {
8538     case PLUS_EXPR:
8539     case MINUS_EXPR:
8540     case NEGATE_EXPR:
8541     case BIT_AND_EXPR:
8542     case BIT_IOR_EXPR:
8543     case BIT_XOR_EXPR:
8544     case BIT_NOT_EXPR:
8545       return true;
8546
8547     default:
8548       return false;
8549     }
8550 }
8551
8552 /* Likewise, but taking a code_helper.  */
8553
8554 bool
8555 vect_can_vectorize_without_simd_p (code_helper code)
8556 {
8557   return (code.is_tree_code ()
8558           && vect_can_vectorize_without_simd_p (tree_code (code)));
8559 }
8560
8561 /* Create vector init for vectorized iv.  */
8562 static tree
8563 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8564                                tree step_expr, poly_uint64 nunits,
8565                                tree vectype,
8566                                enum vect_induction_op_type induction_type)
8567 {
8568   unsigned HOST_WIDE_INT const_nunits;
8569   tree vec_shift, vec_init, new_name;
8570   unsigned i;
8571   tree itype = TREE_TYPE (vectype);
8572
8573   /* iv_loop is the loop to be vectorized. Create:
8574      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
8575   new_name = gimple_convert (stmts, itype, init_expr);
8576   switch (induction_type)
8577     {
8578     case vect_step_op_shr:
8579     case vect_step_op_shl:
8580       /* Build the Initial value from shift_expr.  */
8581       vec_init = gimple_build_vector_from_val (stmts,
8582                                                vectype,
8583                                                new_name);
8584       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8585                                 build_zero_cst (itype), step_expr);
8586       vec_init = gimple_build (stmts,
8587                                (induction_type == vect_step_op_shr
8588                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
8589                                vectype, vec_init, vec_shift);
8590       break;
8591
8592     case vect_step_op_neg:
8593       {
8594         vec_init = gimple_build_vector_from_val (stmts,
8595                                                  vectype,
8596                                                  new_name);
8597         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8598                                      vectype, vec_init);
8599         /* The encoding has 2 interleaved stepped patterns.  */
8600         vec_perm_builder sel (nunits, 2, 3);
8601         sel.quick_grow (6);
8602         for (i = 0; i < 3; i++)
8603           {
8604             sel[2 * i] = i;
8605             sel[2 * i + 1] = i + nunits;
8606           }
8607         vec_perm_indices indices (sel, 2, nunits);
8608         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8609            fail when vec_init is const vector. In that situation vec_perm is not
8610            really needed.  */
8611         tree perm_mask_even
8612           = vect_gen_perm_mask_any (vectype, indices);
8613         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8614                                  vectype,
8615                                  vec_init, vec_neg,
8616                                  perm_mask_even);
8617       }
8618       break;
8619
8620     case vect_step_op_mul:
8621       {
8622         /* Use unsigned mult to avoid UD integer overflow.  */
8623         gcc_assert (nunits.is_constant (&const_nunits));
8624         tree utype = unsigned_type_for (itype);
8625         tree uvectype = build_vector_type (utype,
8626                                            TYPE_VECTOR_SUBPARTS (vectype));
8627         new_name = gimple_convert (stmts, utype, new_name);
8628         vec_init = gimple_build_vector_from_val (stmts,
8629                                                  uvectype,
8630                                                  new_name);
8631         tree_vector_builder elts (uvectype, const_nunits, 1);
8632         tree elt_step = build_one_cst (utype);
8633
8634         elts.quick_push (elt_step);
8635         for (i = 1; i < const_nunits; i++)
8636           {
8637             /* Create: new_name_i = new_name + step_expr.  */
8638             elt_step = gimple_build (stmts, MULT_EXPR,
8639                                      utype, elt_step, step_expr);
8640             elts.quick_push (elt_step);
8641           }
8642         /* Create a vector from [new_name_0, new_name_1, ...,
8643            new_name_nunits-1].  */
8644         tree vec_mul = gimple_build_vector (stmts, &elts);
8645         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8646                                  vec_init, vec_mul);
8647         vec_init = gimple_convert (stmts, vectype, vec_init);
8648       }
8649       break;
8650
8651     default:
8652       gcc_unreachable ();
8653     }
8654
8655   return vec_init;
8656 }
8657
8658 /* Peel init_expr by skip_niter for induction_type.  */
8659 tree
8660 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8661                              tree skip_niters, tree step_expr,
8662                              enum vect_induction_op_type induction_type)
8663 {
8664   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8665   tree type = TREE_TYPE (init_expr);
8666   unsigned prec = TYPE_PRECISION (type);
8667   switch (induction_type)
8668     {
8669     case vect_step_op_neg:
8670       if (TREE_INT_CST_LOW (skip_niters) % 2)
8671         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8672       /* else no change.  */
8673       break;
8674
8675     case vect_step_op_shr:
8676     case vect_step_op_shl:
8677       skip_niters = gimple_convert (stmts, type, skip_niters);
8678       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8679       /* When shift mount >= precision, need to avoid UD.
8680          In the original loop, there's no UD, and according to semantic,
8681          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
8682       if (!tree_fits_uhwi_p (step_expr)
8683           || tree_to_uhwi (step_expr) >= prec)
8684         {
8685           if (induction_type == vect_step_op_shl
8686               || TYPE_UNSIGNED (type))
8687             init_expr = build_zero_cst (type);
8688           else
8689             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8690                                       init_expr,
8691                                       wide_int_to_tree (type, prec - 1));
8692         }
8693       else
8694         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8695                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
8696                                   type, init_expr, step_expr);
8697       break;
8698
8699     case vect_step_op_mul:
8700       {
8701         tree utype = unsigned_type_for (type);
8702         init_expr = gimple_convert (stmts, utype, init_expr);
8703         unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8704         wide_int begin = wi::to_wide (step_expr);
8705         for (unsigned i = 0; i != skipn - 1; i++)
8706           begin = wi::mul (begin, wi::to_wide (step_expr));
8707         tree mult_expr = wide_int_to_tree (utype, begin);
8708         init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8709         init_expr = gimple_convert (stmts, type, init_expr);
8710       }
8711       break;
8712
8713     default:
8714       gcc_unreachable ();
8715     }
8716
8717   return init_expr;
8718 }
8719
8720 /* Create vector step for vectorized iv.  */
8721 static tree
8722 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8723                                poly_uint64 vf,
8724                                enum vect_induction_op_type induction_type)
8725 {
8726   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8727   tree new_name = NULL;
8728   /* Step should be pow (step, vf) for mult induction.  */
8729   if (induction_type == vect_step_op_mul)
8730     {
8731       gcc_assert (vf.is_constant ());
8732       wide_int begin = wi::to_wide (step_expr);
8733
8734       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8735         begin = wi::mul (begin, wi::to_wide (step_expr));
8736
8737       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8738     }
8739   else if (induction_type == vect_step_op_neg)
8740     /* Do nothing.  */
8741     ;
8742   else
8743     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8744                              expr, step_expr);
8745   return new_name;
8746 }
8747
8748 static tree
8749 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8750                                    stmt_vec_info stmt_info,
8751                                    tree new_name, tree vectype,
8752                                    enum vect_induction_op_type induction_type)
8753 {
8754   /* No step is needed for neg induction.  */
8755   if (induction_type == vect_step_op_neg)
8756     return NULL;
8757
8758   tree t = unshare_expr (new_name);
8759   gcc_assert (CONSTANT_CLASS_P (new_name)
8760               || TREE_CODE (new_name) == SSA_NAME);
8761   tree new_vec = build_vector_from_val (vectype, t);
8762   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8763                                     new_vec, vectype, NULL);
8764   return vec_step;
8765 }
8766
8767 /* Update vectorized iv with vect_step, induc_def is init.  */
8768 static tree
8769 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8770                           tree induc_def, tree vec_step,
8771                           enum vect_induction_op_type induction_type)
8772 {
8773   tree vec_def = induc_def;
8774   switch (induction_type)
8775     {
8776     case vect_step_op_mul:
8777       {
8778         /* Use unsigned mult to avoid UD integer overflow.  */
8779         tree uvectype
8780           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8781                                TYPE_VECTOR_SUBPARTS (vectype));
8782         vec_def = gimple_convert (stmts, uvectype, vec_def);
8783         vec_step = gimple_convert (stmts, uvectype, vec_step);
8784         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8785                                 vec_def, vec_step);
8786         vec_def = gimple_convert (stmts, vectype, vec_def);
8787       }
8788       break;
8789
8790     case vect_step_op_shr:
8791       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8792                               vec_def, vec_step);
8793       break;
8794
8795     case vect_step_op_shl:
8796       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8797                               vec_def, vec_step);
8798       break;
8799     case vect_step_op_neg:
8800       vec_def = induc_def;
8801       /* Do nothing.  */
8802       break;
8803     default:
8804       gcc_unreachable ();
8805     }
8806
8807   return vec_def;
8808
8809 }
8810
8811 /* Return true if vectorizer can peel for nonlinear iv.  */
8812 bool
8813 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
8814                               enum vect_induction_op_type induction_type)
8815 {
8816   tree niters_skip;
8817   /* Init_expr will be update by vect_update_ivs_after_vectorizer,
8818      if niters is unkown:
8819      For shift, when shift mount >= precision, there would be UD.
8820      For mult, don't known how to generate
8821      init_expr * pow (step, niters) for variable niters.
8822      For neg, it should be ok, since niters of vectorized main loop
8823      will always be multiple of 2.  */
8824   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8825       && induction_type != vect_step_op_neg)
8826     {
8827       if (dump_enabled_p ())
8828         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8829                          "Peeling for epilogue is not supported"
8830                          " for nonlinear induction except neg"
8831                          " when iteration count is unknown.\n");
8832       return false;
8833     }
8834
8835   /* Also doens't support peel for neg when niter is variable.
8836      ??? generate something like niter_expr & 1 ? init_expr : -init_expr?  */
8837   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8838   if ((niters_skip != NULL_TREE
8839        && TREE_CODE (niters_skip) != INTEGER_CST)
8840       || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
8841           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
8842     {
8843       if (dump_enabled_p ())
8844         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8845                          "Peeling for alignement is not supported"
8846                          " for nonlinear induction when niters_skip"
8847                          " is not constant.\n");
8848       return false;
8849     }
8850
8851   return true;
8852 }
8853
8854 /* Function vectorizable_induction
8855
8856    Check if STMT_INFO performs an nonlinear induction computation that can be
8857    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8858    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8859    basic block.
8860    Return true if STMT_INFO is vectorizable in this way.  */
8861
8862 static bool
8863 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8864                                   stmt_vec_info stmt_info,
8865                                   gimple **vec_stmt, slp_tree slp_node,
8866                                   stmt_vector_for_cost *cost_vec)
8867 {
8868   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8869   unsigned ncopies;
8870   bool nested_in_vect_loop = false;
8871   class loop *iv_loop;
8872   tree vec_def;
8873   edge pe = loop_preheader_edge (loop);
8874   basic_block new_bb;
8875   tree vec_init, vec_step;
8876   tree new_name;
8877   gimple *new_stmt;
8878   gphi *induction_phi;
8879   tree induc_def, vec_dest;
8880   tree init_expr, step_expr;
8881   tree niters_skip;
8882   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8883   unsigned i;
8884   gimple_stmt_iterator si;
8885
8886   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8887
8888   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8889   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8890   enum vect_induction_op_type induction_type
8891     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8892
8893   gcc_assert (induction_type > vect_step_op_add);
8894
8895   if (slp_node)
8896     ncopies = 1;
8897   else
8898     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8899   gcc_assert (ncopies >= 1);
8900
8901   /* FORNOW. Only handle nonlinear induction in the same loop.  */
8902   if (nested_in_vect_loop_p (loop, stmt_info))
8903     {
8904       if (dump_enabled_p ())
8905         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8906                          "nonlinear induction in nested loop.\n");
8907       return false;
8908     }
8909
8910   iv_loop = loop;
8911   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8912
8913   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
8914      update for each iv and a permutation to generate wanted vector iv.  */
8915   if (slp_node)
8916     {
8917       if (dump_enabled_p ())
8918         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8919                          "SLP induction not supported for nonlinear"
8920                          " induction.\n");
8921       return false;
8922     }
8923
8924   if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
8925     return false;
8926
8927   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8928     {
8929       if (dump_enabled_p ())
8930         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8931                          "floating point nonlinear induction vectorization"
8932                          " not supported.\n");
8933       return false;
8934     }
8935
8936   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8937   init_expr = vect_phi_initial_value (phi);
8938   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8939               && TREE_CODE (step_expr) == INTEGER_CST);
8940   /* step_expr should be aligned with init_expr,
8941      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
8942   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8943
8944   if (TREE_CODE (init_expr) == INTEGER_CST)
8945     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8946   else
8947     gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8948                                        TREE_TYPE (init_expr)));
8949
8950   switch (induction_type)
8951     {
8952     case vect_step_op_neg:
8953       if (TREE_CODE (init_expr) != INTEGER_CST
8954           && TREE_CODE (init_expr) != REAL_CST)
8955         {
8956           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
8957           if (!directly_supported_p (NEGATE_EXPR, vectype))
8958             return false;
8959
8960           /* The encoding has 2 interleaved stepped patterns.  */
8961           vec_perm_builder sel (nunits, 2, 3);
8962           machine_mode mode = TYPE_MODE (vectype);
8963           sel.quick_grow (6);
8964           for (i = 0; i < 3; i++)
8965             {
8966               sel[i * 2] = i;
8967               sel[i * 2 + 1] = i + nunits;
8968             }
8969           vec_perm_indices indices (sel, 2, nunits);
8970           if (!can_vec_perm_const_p (mode, mode, indices))
8971             return false;
8972         }
8973       break;
8974
8975     case vect_step_op_mul:
8976       {
8977         /* Check for backend support of MULT_EXPR.  */
8978         if (!directly_supported_p (MULT_EXPR, vectype))
8979           return false;
8980
8981         /* ?? How to construct vector step for variable number vector.
8982            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
8983         if (!vf.is_constant ())
8984           return false;
8985       }
8986       break;
8987
8988     case vect_step_op_shr:
8989       /* Check for backend support of RSHIFT_EXPR.  */
8990       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8991         return false;
8992
8993       /* Don't shift more than type precision to avoid UD.  */
8994       if (!tree_fits_uhwi_p (step_expr)
8995           || maybe_ge (nunits * tree_to_uhwi (step_expr),
8996                        TYPE_PRECISION (TREE_TYPE (init_expr))))
8997         return false;
8998       break;
8999
9000     case vect_step_op_shl:
9001       /* Check for backend support of RSHIFT_EXPR.  */
9002       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9003         return false;
9004
9005       /* Don't shift more than type precision to avoid UD.  */
9006       if (!tree_fits_uhwi_p (step_expr)
9007           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9008                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9009         return false;
9010
9011       break;
9012
9013     default:
9014       gcc_unreachable ();
9015     }
9016
9017   if (!vec_stmt) /* transformation not required.  */
9018     {
9019       unsigned inside_cost = 0, prologue_cost = 0;
9020       /* loop cost for vec_loop. Neg induction doesn't have any
9021          inside_cost.  */
9022       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9023                                       stmt_info, 0, vect_body);
9024
9025       /* loop cost for vec_loop. Neg induction doesn't have any
9026          inside_cost.  */
9027       if (induction_type == vect_step_op_neg)
9028         inside_cost = 0;
9029
9030       /* prologue cost for vec_init and vec_step.  */
9031       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9032                                         stmt_info, 0, vect_prologue);
9033
9034       if (dump_enabled_p ())
9035         dump_printf_loc (MSG_NOTE, vect_location,
9036                          "vect_model_induction_cost: inside_cost = %d, "
9037                          "prologue_cost = %d. \n", inside_cost,
9038                          prologue_cost);
9039
9040       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9041       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9042       return true;
9043     }
9044
9045   /* Transform.  */
9046
9047   /* Compute a vector variable, initialized with the first VF values of
9048      the induction variable.  E.g., for an iv with IV_PHI='X' and
9049      evolution S, for a vector of 4 units, we want to compute:
9050      [X, X + S, X + 2*S, X + 3*S].  */
9051
9052   if (dump_enabled_p ())
9053     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9054
9055   pe = loop_preheader_edge (iv_loop);
9056   /* Find the first insertion point in the BB.  */
9057   basic_block bb = gimple_bb (phi);
9058   si = gsi_after_labels (bb);
9059
9060   gimple_seq stmts = NULL;
9061
9062   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9063   /* If we are using the loop mask to "peel" for alignment then we need
9064      to adjust the start value here.  */
9065   if (niters_skip != NULL_TREE)
9066     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9067                                              step_expr, induction_type);
9068
9069   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9070                                             step_expr, nunits, vectype,
9071                                             induction_type);
9072   if (stmts)
9073     {
9074       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9075       gcc_assert (!new_bb);
9076     }
9077
9078   stmts = NULL;
9079   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9080                                             vf, induction_type);
9081   if (stmts)
9082     {
9083       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9084       gcc_assert (!new_bb);
9085     }
9086
9087   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9088                                                 new_name, vectype,
9089                                                 induction_type);
9090   /* Create the following def-use cycle:
9091      loop prolog:
9092      vec_init = ...
9093      vec_step = ...
9094      loop:
9095      vec_iv = PHI <vec_init, vec_loop>
9096      ...
9097      STMT
9098      ...
9099      vec_loop = vec_iv + vec_step;  */
9100
9101   /* Create the induction-phi that defines the induction-operand.  */
9102   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9103   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9104   induc_def = PHI_RESULT (induction_phi);
9105
9106   /* Create the iv update inside the loop.  */
9107   stmts = NULL;
9108   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9109                                       induc_def, vec_step,
9110                                       induction_type);
9111
9112   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9113   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9114
9115   /* Set the arguments of the phi node:  */
9116   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9117   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9118                UNKNOWN_LOCATION);
9119
9120   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9121   *vec_stmt = induction_phi;
9122
9123   /* In case that vectorization factor (VF) is bigger than the number
9124      of elements that we can fit in a vectype (nunits), we have to generate
9125      more than one vector stmt - i.e - we need to "unroll" the
9126      vector stmt by a factor VF/nunits.  For more details see documentation
9127      in vectorizable_operation.  */
9128
9129   if (ncopies > 1)
9130     {
9131       stmts = NULL;
9132       /* FORNOW. This restriction should be relaxed.  */
9133       gcc_assert (!nested_in_vect_loop);
9134
9135       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9136                                                 nunits, induction_type);
9137
9138       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9139                                                     new_name, vectype,
9140                                                     induction_type);
9141       vec_def = induc_def;
9142       for (i = 1; i < ncopies; i++)
9143         {
9144           /* vec_i = vec_prev + vec_step.  */
9145           stmts = NULL;
9146           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9147                                               vec_def, vec_step,
9148                                               induction_type);
9149           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9150           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9151           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9152         }
9153     }
9154
9155   if (dump_enabled_p ())
9156     dump_printf_loc (MSG_NOTE, vect_location,
9157                      "transform induction: created def-use cycle: %G%G",
9158                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9159
9160   return true;
9161 }
9162
9163 /* Function vectorizable_induction
9164
9165    Check if STMT_INFO performs an induction computation that can be vectorized.
9166    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9167    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9168    Return true if STMT_INFO is vectorizable in this way.  */
9169
9170 bool
9171 vectorizable_induction (loop_vec_info loop_vinfo,
9172                         stmt_vec_info stmt_info,
9173                         gimple **vec_stmt, slp_tree slp_node,
9174                         stmt_vector_for_cost *cost_vec)
9175 {
9176   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9177   unsigned ncopies;
9178   bool nested_in_vect_loop = false;
9179   class loop *iv_loop;
9180   tree vec_def;
9181   edge pe = loop_preheader_edge (loop);
9182   basic_block new_bb;
9183   tree new_vec, vec_init, vec_step, t;
9184   tree new_name;
9185   gimple *new_stmt;
9186   gphi *induction_phi;
9187   tree induc_def, vec_dest;
9188   tree init_expr, step_expr;
9189   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9190   unsigned i;
9191   tree expr;
9192   gimple_stmt_iterator si;
9193   enum vect_induction_op_type induction_type
9194     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9195
9196   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9197   if (!phi)
9198     return false;
9199
9200   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9201     return false;
9202
9203   /* Make sure it was recognized as induction computation.  */
9204   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9205     return false;
9206
9207   /* Handle nonlinear induction in a separate place.  */
9208   if (induction_type != vect_step_op_add)
9209     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9210                                              vec_stmt, slp_node, cost_vec);
9211
9212   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9213   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9214
9215   if (slp_node)
9216     ncopies = 1;
9217   else
9218     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9219   gcc_assert (ncopies >= 1);
9220
9221   /* FORNOW. These restrictions should be relaxed.  */
9222   if (nested_in_vect_loop_p (loop, stmt_info))
9223     {
9224       imm_use_iterator imm_iter;
9225       use_operand_p use_p;
9226       gimple *exit_phi;
9227       edge latch_e;
9228       tree loop_arg;
9229
9230       if (ncopies > 1)
9231         {
9232           if (dump_enabled_p ())
9233             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9234                              "multiple types in nested loop.\n");
9235           return false;
9236         }
9237
9238       exit_phi = NULL;
9239       latch_e = loop_latch_edge (loop->inner);
9240       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9241       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9242         {
9243           gimple *use_stmt = USE_STMT (use_p);
9244           if (is_gimple_debug (use_stmt))
9245             continue;
9246
9247           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9248             {
9249               exit_phi = use_stmt;
9250               break;
9251             }
9252         }
9253       if (exit_phi)
9254         {
9255           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9256           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9257                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9258             {
9259               if (dump_enabled_p ())
9260                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9261                                  "inner-loop induction only used outside "
9262                                  "of the outer vectorized loop.\n");
9263               return false;
9264             }
9265         }
9266
9267       nested_in_vect_loop = true;
9268       iv_loop = loop->inner;
9269     }
9270   else
9271     iv_loop = loop;
9272   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9273
9274   if (slp_node && !nunits.is_constant ())
9275     {
9276       /* The current SLP code creates the step value element-by-element.  */
9277       if (dump_enabled_p ())
9278         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9279                          "SLP induction not supported for variable-length"
9280                          " vectors.\n");
9281       return false;
9282     }
9283
9284   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9285     {
9286       if (dump_enabled_p ())
9287         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9288                          "floating point induction vectorization disabled\n");
9289       return false;
9290     }
9291
9292   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9293   gcc_assert (step_expr != NULL_TREE);
9294   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9295
9296   /* Check for backend support of PLUS/MINUS_EXPR. */
9297   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9298       || !directly_supported_p (MINUS_EXPR, step_vectype))
9299     return false;
9300
9301   if (!vec_stmt) /* transformation not required.  */
9302     {
9303       unsigned inside_cost = 0, prologue_cost = 0;
9304       if (slp_node)
9305         {
9306           /* We eventually need to set a vector type on invariant
9307              arguments.  */
9308           unsigned j;
9309           slp_tree child;
9310           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9311             if (!vect_maybe_update_slp_op_vectype
9312                 (child, SLP_TREE_VECTYPE (slp_node)))
9313               {
9314                 if (dump_enabled_p ())
9315                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9316                                    "incompatible vector types for "
9317                                    "invariants\n");
9318                 return false;
9319               }
9320           /* loop cost for vec_loop.  */
9321           inside_cost
9322             = record_stmt_cost (cost_vec,
9323                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9324                                 vector_stmt, stmt_info, 0, vect_body);
9325           /* prologue cost for vec_init (if not nested) and step.  */
9326           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9327                                             scalar_to_vec,
9328                                             stmt_info, 0, vect_prologue);
9329         }
9330       else /* if (!slp_node) */
9331         {
9332           /* loop cost for vec_loop.  */
9333           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9334                                           stmt_info, 0, vect_body);
9335           /* prologue cost for vec_init and vec_step.  */
9336           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9337                                             stmt_info, 0, vect_prologue);
9338         }
9339       if (dump_enabled_p ())
9340         dump_printf_loc (MSG_NOTE, vect_location,
9341                          "vect_model_induction_cost: inside_cost = %d, "
9342                          "prologue_cost = %d .\n", inside_cost,
9343                          prologue_cost);
9344
9345       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9346       DUMP_VECT_SCOPE ("vectorizable_induction");
9347       return true;
9348     }
9349
9350   /* Transform.  */
9351
9352   /* Compute a vector variable, initialized with the first VF values of
9353      the induction variable.  E.g., for an iv with IV_PHI='X' and
9354      evolution S, for a vector of 4 units, we want to compute:
9355      [X, X + S, X + 2*S, X + 3*S].  */
9356
9357   if (dump_enabled_p ())
9358     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9359
9360   pe = loop_preheader_edge (iv_loop);
9361   /* Find the first insertion point in the BB.  */
9362   basic_block bb = gimple_bb (phi);
9363   si = gsi_after_labels (bb);
9364
9365   /* For SLP induction we have to generate several IVs as for example
9366      with group size 3 we need
9367        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9368        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
9369   if (slp_node)
9370     {
9371       /* Enforced above.  */
9372       unsigned int const_nunits = nunits.to_constant ();
9373
9374       /* The initial values are vectorized, but any lanes > group_size
9375          need adjustment.  */
9376       slp_tree init_node
9377         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9378
9379       /* Gather steps.  Since we do not vectorize inductions as
9380          cycles we have to reconstruct the step from SCEV data.  */
9381       unsigned group_size = SLP_TREE_LANES (slp_node);
9382       tree *steps = XALLOCAVEC (tree, group_size);
9383       tree *inits = XALLOCAVEC (tree, group_size);
9384       stmt_vec_info phi_info;
9385       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9386         {
9387           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9388           if (!init_node)
9389             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9390                                            pe->dest_idx);
9391         }
9392
9393       /* Now generate the IVs.  */
9394       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9395       gcc_assert ((const_nunits * nvects) % group_size == 0);
9396       unsigned nivs;
9397       if (nested_in_vect_loop)
9398         nivs = nvects;
9399       else
9400         {
9401           /* Compute the number of distinct IVs we need.  First reduce
9402              group_size if it is a multiple of const_nunits so we get
9403              one IV for a group_size of 4 but const_nunits 2.  */
9404           unsigned group_sizep = group_size;
9405           if (group_sizep % const_nunits == 0)
9406             group_sizep = group_sizep / const_nunits;
9407           nivs = least_common_multiple (group_sizep,
9408                                         const_nunits) / const_nunits;
9409         }
9410       tree stept = TREE_TYPE (step_vectype);
9411       tree lupdate_mul = NULL_TREE;
9412       if (!nested_in_vect_loop)
9413         {
9414           /* The number of iterations covered in one vector iteration.  */
9415           unsigned lup_mul = (nvects * const_nunits) / group_size;
9416           lupdate_mul
9417             = build_vector_from_val (step_vectype,
9418                                      SCALAR_FLOAT_TYPE_P (stept)
9419                                      ? build_real_from_wide (stept, lup_mul,
9420                                                              UNSIGNED)
9421                                      : build_int_cstu (stept, lup_mul));
9422         }
9423       tree peel_mul = NULL_TREE;
9424       gimple_seq init_stmts = NULL;
9425       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9426         {
9427           if (SCALAR_FLOAT_TYPE_P (stept))
9428             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9429                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9430           else
9431             peel_mul = gimple_convert (&init_stmts, stept,
9432                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9433           peel_mul = gimple_build_vector_from_val (&init_stmts,
9434                                                    step_vectype, peel_mul);
9435         }
9436       unsigned ivn;
9437       auto_vec<tree> vec_steps;
9438       for (ivn = 0; ivn < nivs; ++ivn)
9439         {
9440           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9441           tree_vector_builder init_elts (vectype, const_nunits, 1);
9442           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9443           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9444             {
9445               /* The scalar steps of the IVs.  */
9446               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9447               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9448               step_elts.quick_push (elt);
9449               if (!init_node)
9450                 {
9451                   /* The scalar inits of the IVs if not vectorized.  */
9452                   elt = inits[(ivn*const_nunits + eltn) % group_size];
9453                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
9454                                                   TREE_TYPE (elt)))
9455                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9456                                         TREE_TYPE (vectype), elt);
9457                   init_elts.quick_push (elt);
9458                 }
9459               /* The number of steps to add to the initial values.  */
9460               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9461               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9462                                    ? build_real_from_wide (stept,
9463                                                            mul_elt, UNSIGNED)
9464                                    : build_int_cstu (stept, mul_elt));
9465             }
9466           vec_step = gimple_build_vector (&init_stmts, &step_elts);
9467           vec_steps.safe_push (vec_step);
9468           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9469           if (peel_mul)
9470             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9471                                      step_mul, peel_mul);
9472           if (!init_node)
9473             vec_init = gimple_build_vector (&init_stmts, &init_elts);
9474
9475           /* Create the induction-phi that defines the induction-operand.  */
9476           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9477                                             "vec_iv_");
9478           induction_phi = create_phi_node (vec_dest, iv_loop->header);
9479           induc_def = PHI_RESULT (induction_phi);
9480
9481           /* Create the iv update inside the loop  */
9482           tree up = vec_step;
9483           if (lupdate_mul)
9484             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9485                                vec_step, lupdate_mul);
9486           gimple_seq stmts = NULL;
9487           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9488           vec_def = gimple_build (&stmts,
9489                                   PLUS_EXPR, step_vectype, vec_def, up);
9490           vec_def = gimple_convert (&stmts, vectype, vec_def);
9491           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9492           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9493                        UNKNOWN_LOCATION);
9494
9495           if (init_node)
9496             vec_init = vect_get_slp_vect_def (init_node, ivn);
9497           if (!nested_in_vect_loop
9498               && !integer_zerop (step_mul))
9499             {
9500               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9501               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9502                                  vec_step, step_mul);
9503               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9504                                       vec_def, up);
9505               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9506             }
9507
9508           /* Set the arguments of the phi node:  */
9509           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9510
9511           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9512         }
9513       if (!nested_in_vect_loop)
9514         {
9515           /* Fill up to the number of vectors we need for the whole group.  */
9516           nivs = least_common_multiple (group_size,
9517                                         const_nunits) / const_nunits;
9518           vec_steps.reserve (nivs-ivn);
9519           for (; ivn < nivs; ++ivn)
9520             {
9521               SLP_TREE_VEC_STMTS (slp_node)
9522                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9523               vec_steps.quick_push (vec_steps[0]);
9524             }
9525         }
9526
9527       /* Re-use IVs when we can.  We are generating further vector
9528          stmts by adding VF' * stride to the IVs generated above.  */
9529       if (ivn < nvects)
9530         {
9531           unsigned vfp
9532             = least_common_multiple (group_size, const_nunits) / group_size;
9533           tree lupdate_mul
9534             = build_vector_from_val (step_vectype,
9535                                      SCALAR_FLOAT_TYPE_P (stept)
9536                                      ? build_real_from_wide (stept,
9537                                                              vfp, UNSIGNED)
9538                                      : build_int_cstu (stept, vfp));
9539           for (; ivn < nvects; ++ivn)
9540             {
9541               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9542               tree def = gimple_get_lhs (iv);
9543               if (ivn < 2*nivs)
9544                 vec_steps[ivn - nivs]
9545                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9546                                   vec_steps[ivn - nivs], lupdate_mul);
9547               gimple_seq stmts = NULL;
9548               def = gimple_convert (&stmts, step_vectype, def);
9549               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9550                                   def, vec_steps[ivn % nivs]);
9551               def = gimple_convert (&stmts, vectype, def);
9552               if (gimple_code (iv) == GIMPLE_PHI)
9553                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9554               else
9555                 {
9556                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9557                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9558                 }
9559               SLP_TREE_VEC_STMTS (slp_node)
9560                 .quick_push (SSA_NAME_DEF_STMT (def));
9561             }
9562         }
9563
9564       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9565       gcc_assert (!new_bb);
9566
9567       return true;
9568     }
9569
9570   init_expr = vect_phi_initial_value (phi);
9571
9572   gimple_seq stmts = NULL;
9573   if (!nested_in_vect_loop)
9574     {
9575       /* Convert the initial value to the IV update type.  */
9576       tree new_type = TREE_TYPE (step_expr);
9577       init_expr = gimple_convert (&stmts, new_type, init_expr);
9578
9579       /* If we are using the loop mask to "peel" for alignment then we need
9580          to adjust the start value here.  */
9581       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9582       if (skip_niters != NULL_TREE)
9583         {
9584           if (FLOAT_TYPE_P (vectype))
9585             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9586                                         skip_niters);
9587           else
9588             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9589           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9590                                          skip_niters, step_expr);
9591           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9592                                     init_expr, skip_step);
9593         }
9594     }
9595
9596   if (stmts)
9597     {
9598       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9599       gcc_assert (!new_bb);
9600     }
9601
9602   /* Create the vector that holds the initial_value of the induction.  */
9603   if (nested_in_vect_loop)
9604     {
9605       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
9606          been created during vectorization of previous stmts.  We obtain it
9607          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
9608       auto_vec<tree> vec_inits;
9609       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9610                                      init_expr, &vec_inits);
9611       vec_init = vec_inits[0];
9612       /* If the initial value is not of proper type, convert it.  */
9613       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9614         {
9615           new_stmt
9616             = gimple_build_assign (vect_get_new_ssa_name (vectype,
9617                                                           vect_simple_var,
9618                                                           "vec_iv_"),
9619                                    VIEW_CONVERT_EXPR,
9620                                    build1 (VIEW_CONVERT_EXPR, vectype,
9621                                            vec_init));
9622           vec_init = gimple_assign_lhs (new_stmt);
9623           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9624                                                  new_stmt);
9625           gcc_assert (!new_bb);
9626         }
9627     }
9628   else
9629     {
9630       /* iv_loop is the loop to be vectorized. Create:
9631          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
9632       stmts = NULL;
9633       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9634
9635       unsigned HOST_WIDE_INT const_nunits;
9636       if (nunits.is_constant (&const_nunits))
9637         {
9638           tree_vector_builder elts (step_vectype, const_nunits, 1);
9639           elts.quick_push (new_name);
9640           for (i = 1; i < const_nunits; i++)
9641             {
9642               /* Create: new_name_i = new_name + step_expr  */
9643               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9644                                        new_name, step_expr);
9645               elts.quick_push (new_name);
9646             }
9647           /* Create a vector from [new_name_0, new_name_1, ...,
9648              new_name_nunits-1]  */
9649           vec_init = gimple_build_vector (&stmts, &elts);
9650         }
9651       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9652         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
9653         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9654                                  new_name, step_expr);
9655       else
9656         {
9657           /* Build:
9658                 [base, base, base, ...]
9659                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
9660           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9661           gcc_assert (flag_associative_math);
9662           tree index = build_index_vector (step_vectype, 0, 1);
9663           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9664                                                         new_name);
9665           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9666                                                         step_expr);
9667           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9668           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9669                                    vec_init, step_vec);
9670           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9671                                    vec_init, base_vec);
9672         }
9673       vec_init = gimple_convert (&stmts, vectype, vec_init);
9674
9675       if (stmts)
9676         {
9677           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9678           gcc_assert (!new_bb);
9679         }
9680     }
9681
9682
9683   /* Create the vector that holds the step of the induction.  */
9684   if (nested_in_vect_loop)
9685     /* iv_loop is nested in the loop to be vectorized. Generate:
9686        vec_step = [S, S, S, S]  */
9687     new_name = step_expr;
9688   else
9689     {
9690       /* iv_loop is the loop to be vectorized. Generate:
9691           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
9692       gimple_seq seq = NULL;
9693       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9694         {
9695           expr = build_int_cst (integer_type_node, vf);
9696           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9697         }
9698       else
9699         expr = build_int_cst (TREE_TYPE (step_expr), vf);
9700       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9701                                expr, step_expr);
9702       if (seq)
9703         {
9704           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9705           gcc_assert (!new_bb);
9706         }
9707     }
9708
9709   t = unshare_expr (new_name);
9710   gcc_assert (CONSTANT_CLASS_P (new_name)
9711               || TREE_CODE (new_name) == SSA_NAME);
9712   new_vec = build_vector_from_val (step_vectype, t);
9713   vec_step = vect_init_vector (loop_vinfo, stmt_info,
9714                                new_vec, step_vectype, NULL);
9715
9716
9717   /* Create the following def-use cycle:
9718      loop prolog:
9719          vec_init = ...
9720          vec_step = ...
9721      loop:
9722          vec_iv = PHI <vec_init, vec_loop>
9723          ...
9724          STMT
9725          ...
9726          vec_loop = vec_iv + vec_step;  */
9727
9728   /* Create the induction-phi that defines the induction-operand.  */
9729   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9730   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9731   induc_def = PHI_RESULT (induction_phi);
9732
9733   /* Create the iv update inside the loop  */
9734   stmts = NULL;
9735   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9736   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9737   vec_def = gimple_convert (&stmts, vectype, vec_def);
9738   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9739   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9740
9741   /* Set the arguments of the phi node:  */
9742   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9743   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9744                UNKNOWN_LOCATION);
9745
9746   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9747   *vec_stmt = induction_phi;
9748
9749   /* In case that vectorization factor (VF) is bigger than the number
9750      of elements that we can fit in a vectype (nunits), we have to generate
9751      more than one vector stmt - i.e - we need to "unroll" the
9752      vector stmt by a factor VF/nunits.  For more details see documentation
9753      in vectorizable_operation.  */
9754
9755   if (ncopies > 1)
9756     {
9757       gimple_seq seq = NULL;
9758       /* FORNOW. This restriction should be relaxed.  */
9759       gcc_assert (!nested_in_vect_loop);
9760
9761       /* Create the vector that holds the step of the induction.  */
9762       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9763         {
9764           expr = build_int_cst (integer_type_node, nunits);
9765           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9766         }
9767       else
9768         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9769       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9770                                expr, step_expr);
9771       if (seq)
9772         {
9773           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9774           gcc_assert (!new_bb);
9775         }
9776
9777       t = unshare_expr (new_name);
9778       gcc_assert (CONSTANT_CLASS_P (new_name)
9779                   || TREE_CODE (new_name) == SSA_NAME);
9780       new_vec = build_vector_from_val (step_vectype, t);
9781       vec_step = vect_init_vector (loop_vinfo, stmt_info,
9782                                    new_vec, step_vectype, NULL);
9783
9784       vec_def = induc_def;
9785       for (i = 1; i < ncopies; i++)
9786         {
9787           /* vec_i = vec_prev + vec_step  */
9788           gimple_seq stmts = NULL;
9789           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9790           vec_def = gimple_build (&stmts,
9791                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
9792           vec_def = gimple_convert (&stmts, vectype, vec_def);
9793
9794           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9795           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9796           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9797         }
9798     }
9799
9800   if (dump_enabled_p ())
9801     dump_printf_loc (MSG_NOTE, vect_location,
9802                      "transform induction: created def-use cycle: %G%G",
9803                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9804
9805   return true;
9806 }
9807
9808 /* Function vectorizable_live_operation.
9809
9810    STMT_INFO computes a value that is used outside the loop.  Check if
9811    it can be supported.  */
9812
9813 bool
9814 vectorizable_live_operation (vec_info *vinfo,
9815                              stmt_vec_info stmt_info,
9816                              gimple_stmt_iterator *gsi,
9817                              slp_tree slp_node, slp_instance slp_node_instance,
9818                              int slp_index, bool vec_stmt_p,
9819                              stmt_vector_for_cost *cost_vec)
9820 {
9821   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9822   imm_use_iterator imm_iter;
9823   tree lhs, lhs_type, bitsize;
9824   tree vectype = (slp_node
9825                   ? SLP_TREE_VECTYPE (slp_node)
9826                   : STMT_VINFO_VECTYPE (stmt_info));
9827   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9828   int ncopies;
9829   gimple *use_stmt;
9830   auto_vec<tree> vec_oprnds;
9831   int vec_entry = 0;
9832   poly_uint64 vec_index = 0;
9833
9834   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9835
9836   /* If a stmt of a reduction is live, vectorize it via
9837      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
9838      validity so just trigger the transform here.  */
9839   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9840     {
9841       if (!vec_stmt_p)
9842         return true;
9843       if (slp_node)
9844         {
9845           /* For reduction chains the meta-info is attached to
9846              the group leader.  */
9847           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9848             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9849           /* For SLP reductions we vectorize the epilogue for
9850              all involved stmts together.  */
9851           else if (slp_index != 0)
9852             return true;
9853         }
9854       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9855       gcc_assert (reduc_info->is_reduc_info);
9856       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9857           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9858         return true;
9859       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9860                                         slp_node_instance);
9861       return true;
9862     }
9863
9864   /* If STMT is not relevant and it is a simple assignment and its inputs are
9865      invariant then it can remain in place, unvectorized.  The original last
9866      scalar value that it computes will be used.  */
9867   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9868     {
9869       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9870       if (dump_enabled_p ())
9871         dump_printf_loc (MSG_NOTE, vect_location,
9872                          "statement is simple and uses invariant.  Leaving in "
9873                          "place.\n");
9874       return true;
9875     }
9876
9877   if (slp_node)
9878     ncopies = 1;
9879   else
9880     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9881
9882   if (slp_node)
9883     {
9884       gcc_assert (slp_index >= 0);
9885
9886       /* Get the last occurrence of the scalar index from the concatenation of
9887          all the slp vectors. Calculate which slp vector it is and the index
9888          within.  */
9889       int num_scalar = SLP_TREE_LANES (slp_node);
9890       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9891       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9892
9893       /* Calculate which vector contains the result, and which lane of
9894          that vector we need.  */
9895       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9896         {
9897           if (dump_enabled_p ())
9898             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9899                              "Cannot determine which vector holds the"
9900                              " final result.\n");
9901           return false;
9902         }
9903     }
9904
9905   if (!vec_stmt_p)
9906     {
9907       /* No transformation required.  */
9908       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9909         {
9910           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9911                                                OPTIMIZE_FOR_SPEED))
9912             {
9913               if (dump_enabled_p ())
9914                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9915                                  "can't operate on partial vectors "
9916                                  "because the target doesn't support extract "
9917                                  "last reduction.\n");
9918               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9919             }
9920           else if (slp_node)
9921             {
9922               if (dump_enabled_p ())
9923                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9924                                  "can't operate on partial vectors "
9925                                  "because an SLP statement is live after "
9926                                  "the loop.\n");
9927               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9928             }
9929           else if (ncopies > 1)
9930             {
9931               if (dump_enabled_p ())
9932                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9933                                  "can't operate on partial vectors "
9934                                  "because ncopies is greater than 1.\n");
9935               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9936             }
9937           else
9938             {
9939               gcc_assert (ncopies == 1 && !slp_node);
9940               vect_record_loop_mask (loop_vinfo,
9941                                      &LOOP_VINFO_MASKS (loop_vinfo),
9942                                      1, vectype, NULL);
9943             }
9944         }
9945       /* ???  Enable for loop costing as well.  */
9946       if (!loop_vinfo)
9947         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9948                           0, vect_epilogue);
9949       return true;
9950     }
9951
9952   /* Use the lhs of the original scalar statement.  */
9953   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9954   if (dump_enabled_p ())
9955     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9956                      "stmt %G", stmt);
9957
9958   lhs = gimple_get_lhs (stmt);
9959   lhs_type = TREE_TYPE (lhs);
9960
9961   bitsize = vector_element_bits_tree (vectype);
9962
9963   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
9964   tree vec_lhs, bitstart;
9965   gimple *vec_stmt;
9966   if (slp_node)
9967     {
9968       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9969
9970       /* Get the correct slp vectorized stmt.  */
9971       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9972       vec_lhs = gimple_get_lhs (vec_stmt);
9973
9974       /* Get entry to use.  */
9975       bitstart = bitsize_int (vec_index);
9976       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9977     }
9978   else
9979     {
9980       /* For multiple copies, get the last copy.  */
9981       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9982       vec_lhs = gimple_get_lhs (vec_stmt);
9983
9984       /* Get the last lane in the vector.  */
9985       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9986     }
9987
9988   if (loop_vinfo)
9989     {
9990       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
9991          requirement, insert one phi node for it.  It looks like:
9992            loop;
9993          BB:
9994            # lhs' = PHI <lhs>
9995          ==>
9996            loop;
9997          BB:
9998            # vec_lhs' = PHI <vec_lhs>
9999            new_tree = lane_extract <vec_lhs', ...>;
10000            lhs' = new_tree;  */
10001
10002       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10003       basic_block exit_bb = single_exit (loop)->dest;
10004       gcc_assert (single_pred_p (exit_bb));
10005
10006       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10007       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10008       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10009
10010       gimple_seq stmts = NULL;
10011       tree new_tree;
10012       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10013         {
10014           /* Emit:
10015
10016                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10017
10018              where VEC_LHS is the vectorized live-out result and MASK is
10019              the loop mask for the final iteration.  */
10020           gcc_assert (ncopies == 1 && !slp_node);
10021           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10022           tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
10023                                           1, vectype, 0);
10024           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10025                                           mask, vec_lhs_phi);
10026
10027           /* Convert the extracted vector element to the scalar type.  */
10028           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10029         }
10030       else
10031         {
10032           tree bftype = TREE_TYPE (vectype);
10033           if (VECTOR_BOOLEAN_TYPE_P (vectype))
10034             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10035           new_tree = build3 (BIT_FIELD_REF, bftype,
10036                              vec_lhs_phi, bitsize, bitstart);
10037           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10038                                            &stmts, true, NULL_TREE);
10039         }
10040
10041       if (stmts)
10042         {
10043           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10044           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10045
10046           /* Remove existing phi from lhs and create one copy from new_tree.  */
10047           tree lhs_phi = NULL_TREE;
10048           gimple_stmt_iterator gsi;
10049           for (gsi = gsi_start_phis (exit_bb);
10050                !gsi_end_p (gsi); gsi_next (&gsi))
10051             {
10052               gimple *phi = gsi_stmt (gsi);
10053               if ((gimple_phi_arg_def (phi, 0) == lhs))
10054                 {
10055                   remove_phi_node (&gsi, false);
10056                   lhs_phi = gimple_phi_result (phi);
10057                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10058                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10059                   break;
10060                 }
10061             }
10062         }
10063
10064       /* Replace use of lhs with newly computed result.  If the use stmt is a
10065          single arg PHI, just replace all uses of PHI result.  It's necessary
10066          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10067       use_operand_p use_p;
10068       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10069         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10070             && !is_gimple_debug (use_stmt))
10071           {
10072             if (gimple_code (use_stmt) == GIMPLE_PHI
10073                 && gimple_phi_num_args (use_stmt) == 1)
10074               {
10075                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10076               }
10077             else
10078               {
10079                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10080                     SET_USE (use_p, new_tree);
10081               }
10082             update_stmt (use_stmt);
10083           }
10084     }
10085   else
10086     {
10087       /* For basic-block vectorization simply insert the lane-extraction.  */
10088       tree bftype = TREE_TYPE (vectype);
10089       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10090         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10091       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10092                               vec_lhs, bitsize, bitstart);
10093       gimple_seq stmts = NULL;
10094       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10095                                        &stmts, true, NULL_TREE);
10096       if (TREE_CODE (new_tree) == SSA_NAME
10097           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10098         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10099       if (is_a <gphi *> (vec_stmt))
10100         {
10101           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10102           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10103         }
10104       else
10105         {
10106           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10107           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10108         }
10109
10110       /* Replace use of lhs with newly computed result.  If the use stmt is a
10111          single arg PHI, just replace all uses of PHI result.  It's necessary
10112          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10113       use_operand_p use_p;
10114       stmt_vec_info use_stmt_info;
10115       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10116         if (!is_gimple_debug (use_stmt)
10117             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10118                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10119           {
10120             /* ???  This can happen when the live lane ends up being
10121                used in a vector construction code-generated by an
10122                external SLP node (and code-generation for that already
10123                happened).  See gcc.dg/vect/bb-slp-47.c.
10124                Doing this is what would happen if that vector CTOR
10125                were not code-generated yet so it is not too bad.
10126                ???  In fact we'd likely want to avoid this situation
10127                in the first place.  */
10128             if (TREE_CODE (new_tree) == SSA_NAME
10129                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10130                 && gimple_code (use_stmt) != GIMPLE_PHI
10131                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10132                                                 use_stmt))
10133               {
10134                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10135                 gcc_assert (code == CONSTRUCTOR
10136                             || code == VIEW_CONVERT_EXPR
10137                             || CONVERT_EXPR_CODE_P (code));
10138                 if (dump_enabled_p ())
10139                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10140                                    "Using original scalar computation for "
10141                                    "live lane because use preceeds vector "
10142                                    "def\n");
10143                 continue;
10144               }
10145             /* ???  It can also happen that we end up pulling a def into
10146                a loop where replacing out-of-loop uses would require
10147                a new LC SSA PHI node.  Retain the original scalar in
10148                those cases as well.  PR98064.  */
10149             if (TREE_CODE (new_tree) == SSA_NAME
10150                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10151                 && (gimple_bb (use_stmt)->loop_father
10152                     != gimple_bb (vec_stmt)->loop_father)
10153                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10154                                         gimple_bb (use_stmt)->loop_father))
10155               {
10156                 if (dump_enabled_p ())
10157                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10158                                    "Using original scalar computation for "
10159                                    "live lane because there is an out-of-loop "
10160                                    "definition for it\n");
10161                 continue;
10162               }
10163             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10164               SET_USE (use_p, new_tree);
10165             update_stmt (use_stmt);
10166           }
10167     }
10168
10169   return true;
10170 }
10171
10172 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
10173
10174 static void
10175 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10176 {
10177   ssa_op_iter op_iter;
10178   imm_use_iterator imm_iter;
10179   def_operand_p def_p;
10180   gimple *ustmt;
10181
10182   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10183     {
10184       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10185         {
10186           basic_block bb;
10187
10188           if (!is_gimple_debug (ustmt))
10189             continue;
10190
10191           bb = gimple_bb (ustmt);
10192
10193           if (!flow_bb_inside_loop_p (loop, bb))
10194             {
10195               if (gimple_debug_bind_p (ustmt))
10196                 {
10197                   if (dump_enabled_p ())
10198                     dump_printf_loc (MSG_NOTE, vect_location,
10199                                      "killing debug use\n");
10200
10201                   gimple_debug_bind_reset_value (ustmt);
10202                   update_stmt (ustmt);
10203                 }
10204               else
10205                 gcc_unreachable ();
10206             }
10207         }
10208     }
10209 }
10210
10211 /* Given loop represented by LOOP_VINFO, return true if computation of
10212    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10213    otherwise.  */
10214
10215 static bool
10216 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10217 {
10218   /* Constant case.  */
10219   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10220     {
10221       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10222       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10223
10224       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10225       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10226       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10227         return true;
10228     }
10229
10230   widest_int max;
10231   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10232   /* Check the upper bound of loop niters.  */
10233   if (get_max_loop_iterations (loop, &max))
10234     {
10235       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10236       signop sgn = TYPE_SIGN (type);
10237       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10238       if (max < type_max)
10239         return true;
10240     }
10241   return false;
10242 }
10243
10244 /* Return a mask type with half the number of elements as OLD_TYPE,
10245    given that it should have mode NEW_MODE.  */
10246
10247 tree
10248 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10249 {
10250   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10251   return build_truth_vector_type_for_mode (nunits, new_mode);
10252 }
10253
10254 /* Return a mask type with twice as many elements as OLD_TYPE,
10255    given that it should have mode NEW_MODE.  */
10256
10257 tree
10258 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10259 {
10260   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10261   return build_truth_vector_type_for_mode (nunits, new_mode);
10262 }
10263
10264 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10265    contain a sequence of NVECTORS masks that each control a vector of type
10266    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
10267    these vector masks with the vector version of SCALAR_MASK.  */
10268
10269 void
10270 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10271                        unsigned int nvectors, tree vectype, tree scalar_mask)
10272 {
10273   gcc_assert (nvectors != 0);
10274   if (masks->length () < nvectors)
10275     masks->safe_grow_cleared (nvectors, true);
10276   rgroup_controls *rgm = &(*masks)[nvectors - 1];
10277   /* The number of scalars per iteration and the number of vectors are
10278      both compile-time constants.  */
10279   unsigned int nscalars_per_iter
10280     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10281                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10282
10283   if (scalar_mask)
10284     {
10285       scalar_cond_masked_key cond (scalar_mask, nvectors);
10286       loop_vinfo->scalar_cond_masked_set.add (cond);
10287     }
10288
10289   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10290     {
10291       rgm->max_nscalars_per_iter = nscalars_per_iter;
10292       rgm->type = truth_type_for (vectype);
10293       rgm->factor = 1;
10294     }
10295 }
10296
10297 /* Given a complete set of masks MASKS, extract mask number INDEX
10298    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10299    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
10300
10301    See the comment above vec_loop_masks for more details about the mask
10302    arrangement.  */
10303
10304 tree
10305 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10306                     unsigned int nvectors, tree vectype, unsigned int index)
10307 {
10308   rgroup_controls *rgm = &(*masks)[nvectors - 1];
10309   tree mask_type = rgm->type;
10310
10311   /* Populate the rgroup's mask array, if this is the first time we've
10312      used it.  */
10313   if (rgm->controls.is_empty ())
10314     {
10315       rgm->controls.safe_grow_cleared (nvectors, true);
10316       for (unsigned int i = 0; i < nvectors; ++i)
10317         {
10318           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10319           /* Provide a dummy definition until the real one is available.  */
10320           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10321           rgm->controls[i] = mask;
10322         }
10323     }
10324
10325   tree mask = rgm->controls[index];
10326   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10327                 TYPE_VECTOR_SUBPARTS (vectype)))
10328     {
10329       /* A loop mask for data type X can be reused for data type Y
10330          if X has N times more elements than Y and if Y's elements
10331          are N times bigger than X's.  In this case each sequence
10332          of N elements in the loop mask will be all-zero or all-one.
10333          We can then view-convert the mask so that each sequence of
10334          N elements is replaced by a single element.  */
10335       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10336                               TYPE_VECTOR_SUBPARTS (vectype)));
10337       gimple_seq seq = NULL;
10338       mask_type = truth_type_for (vectype);
10339       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10340       if (seq)
10341         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10342     }
10343   return mask;
10344 }
10345
10346 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10347    lengths for controlling an operation on VECTYPE.  The operation splits
10348    each element of VECTYPE into FACTOR separate subelements, measuring the
10349    length as a number of these subelements.  */
10350
10351 void
10352 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10353                       unsigned int nvectors, tree vectype, unsigned int factor)
10354 {
10355   gcc_assert (nvectors != 0);
10356   if (lens->length () < nvectors)
10357     lens->safe_grow_cleared (nvectors, true);
10358   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10359
10360   /* The number of scalars per iteration, scalar occupied bytes and
10361      the number of vectors are both compile-time constants.  */
10362   unsigned int nscalars_per_iter
10363     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10364                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10365
10366   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10367     {
10368       /* For now, we only support cases in which all loads and stores fall back
10369          to VnQI or none do.  */
10370       gcc_assert (!rgl->max_nscalars_per_iter
10371                   || (rgl->factor == 1 && factor == 1)
10372                   || (rgl->max_nscalars_per_iter * rgl->factor
10373                       == nscalars_per_iter * factor));
10374       rgl->max_nscalars_per_iter = nscalars_per_iter;
10375       rgl->type = vectype;
10376       rgl->factor = factor;
10377     }
10378 }
10379
10380 /* Given a complete set of length LENS, extract length number INDEX for an
10381    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
10382
10383 tree
10384 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10385                    unsigned int nvectors, unsigned int index)
10386 {
10387   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10388   bool use_bias_adjusted_len =
10389     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10390
10391   /* Populate the rgroup's len array, if this is the first time we've
10392      used it.  */
10393   if (rgl->controls.is_empty ())
10394     {
10395       rgl->controls.safe_grow_cleared (nvectors, true);
10396       for (unsigned int i = 0; i < nvectors; ++i)
10397         {
10398           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10399           gcc_assert (len_type != NULL_TREE);
10400
10401           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10402
10403           /* Provide a dummy definition until the real one is available.  */
10404           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10405           rgl->controls[i] = len;
10406
10407           if (use_bias_adjusted_len)
10408             {
10409               gcc_assert (i == 0);
10410               tree adjusted_len =
10411                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10412               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10413               rgl->bias_adjusted_ctrl = adjusted_len;
10414             }
10415         }
10416     }
10417
10418   if (use_bias_adjusted_len)
10419     return rgl->bias_adjusted_ctrl;
10420   else
10421     return rgl->controls[index];
10422 }
10423
10424 /* Scale profiling counters by estimation for LOOP which is vectorized
10425    by factor VF.  */
10426
10427 static void
10428 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10429 {
10430   edge preheader = loop_preheader_edge (loop);
10431   /* Reduce loop iterations by the vectorization factor.  */
10432   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10433   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10434
10435   if (freq_h.nonzero_p ())
10436     {
10437       profile_probability p;
10438
10439       /* Avoid dropping loop body profile counter to 0 because of zero count
10440          in loop's preheader.  */
10441       if (!(freq_e == profile_count::zero ()))
10442         freq_e = freq_e.force_nonzero ();
10443       p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10444       scale_loop_frequencies (loop, p);
10445     }
10446
10447   edge exit_e = single_exit (loop);
10448   exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10449
10450   edge exit_l = single_pred_edge (loop->latch);
10451   profile_probability prob = exit_l->probability;
10452   exit_l->probability = exit_e->probability.invert ();
10453   if (prob.initialized_p () && exit_l->probability.initialized_p ())
10454     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10455 }
10456
10457 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10458    latch edge values originally defined by it.  */
10459
10460 static void
10461 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10462                                      stmt_vec_info def_stmt_info)
10463 {
10464   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10465   if (!def || TREE_CODE (def) != SSA_NAME)
10466     return;
10467   stmt_vec_info phi_info;
10468   imm_use_iterator iter;
10469   use_operand_p use_p;
10470   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10471     {
10472       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10473       if (!phi)
10474         continue;
10475       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10476             && (phi_info = loop_vinfo->lookup_stmt (phi))
10477             && STMT_VINFO_RELEVANT_P (phi_info)))
10478         continue;
10479       loop_p loop = gimple_bb (phi)->loop_father;
10480       edge e = loop_latch_edge (loop);
10481       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10482         continue;
10483
10484       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10485           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10486           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10487         {
10488           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10489           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10490           gcc_assert (phi_defs.length () == latch_defs.length ());
10491           for (unsigned i = 0; i < phi_defs.length (); ++i)
10492             add_phi_arg (as_a <gphi *> (phi_defs[i]),
10493                          gimple_get_lhs (latch_defs[i]), e,
10494                          gimple_phi_arg_location (phi, e->dest_idx));
10495         }
10496       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10497         {
10498           /* For first order recurrences we have to update both uses of
10499              the latch definition, the one in the PHI node and the one
10500              in the generated VEC_PERM_EXPR.  */
10501           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10502           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10503           gcc_assert (phi_defs.length () == latch_defs.length ());
10504           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10505           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10506           for (unsigned i = 0; i < phi_defs.length (); ++i)
10507             {
10508               gassign *perm = as_a <gassign *> (phi_defs[i]);
10509               if (i > 0)
10510                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10511               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10512               update_stmt (perm);
10513             }
10514           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10515                        gimple_phi_arg_location (phi, e->dest_idx));
10516         }
10517     }
10518 }
10519
10520 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10521    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10522    stmt_vec_info.  */
10523
10524 static bool
10525 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10526                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10527 {
10528   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10529   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10530
10531   if (dump_enabled_p ())
10532     dump_printf_loc (MSG_NOTE, vect_location,
10533                      "------>vectorizing statement: %G", stmt_info->stmt);
10534
10535   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10536     vect_loop_kill_debug_uses (loop, stmt_info);
10537
10538   if (!STMT_VINFO_RELEVANT_P (stmt_info)
10539       && !STMT_VINFO_LIVE_P (stmt_info))
10540     return false;
10541
10542   if (STMT_VINFO_VECTYPE (stmt_info))
10543     {
10544       poly_uint64 nunits
10545         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10546       if (!STMT_SLP_TYPE (stmt_info)
10547           && maybe_ne (nunits, vf)
10548           && dump_enabled_p ())
10549         /* For SLP VF is set according to unrolling factor, and not
10550            to vector size, hence for SLP this print is not valid.  */
10551         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10552     }
10553
10554   /* Pure SLP statements have already been vectorized.  We still need
10555      to apply loop vectorization to hybrid SLP statements.  */
10556   if (PURE_SLP_STMT (stmt_info))
10557     return false;
10558
10559   if (dump_enabled_p ())
10560     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10561
10562   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10563     *seen_store = stmt_info;
10564
10565   return true;
10566 }
10567
10568 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10569    in the hash_map with its corresponding values.  */
10570
10571 static tree
10572 find_in_mapping (tree t, void *context)
10573 {
10574   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10575
10576   tree *value = mapping->get (t);
10577   return value ? *value : t;
10578 }
10579
10580 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
10581    original loop that has now been vectorized.
10582
10583    The inits of the data_references need to be advanced with the number of
10584    iterations of the main loop.  This has been computed in vect_do_peeling and
10585    is stored in parameter ADVANCE.  We first restore the data_references
10586    initial offset with the values recored in ORIG_DRS_INIT.
10587
10588    Since the loop_vec_info of this EPILOGUE was constructed for the original
10589    loop, its stmt_vec_infos all point to the original statements.  These need
10590    to be updated to point to their corresponding copies as well as the SSA_NAMES
10591    in their PATTERN_DEF_SEQs and RELATED_STMTs.
10592
10593    The data_reference's connections also need to be updated.  Their
10594    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10595    stmt_vec_infos, their statements need to point to their corresponding copy,
10596    if they are gather loads or scatter stores then their reference needs to be
10597    updated to point to its corresponding copy and finally we set
10598    'base_misaligned' to false as we have already peeled for alignment in the
10599    prologue of the main loop.  */
10600
10601 static void
10602 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10603 {
10604   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10605   auto_vec<gimple *> stmt_worklist;
10606   hash_map<tree,tree> mapping;
10607   gimple *orig_stmt, *new_stmt;
10608   gimple_stmt_iterator epilogue_gsi;
10609   gphi_iterator epilogue_phi_gsi;
10610   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10611   basic_block *epilogue_bbs = get_loop_body (epilogue);
10612   unsigned i;
10613
10614   free (LOOP_VINFO_BBS (epilogue_vinfo));
10615   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10616
10617   /* Advance data_reference's with the number of iterations of the previous
10618      loop and its prologue.  */
10619   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10620
10621
10622   /* The EPILOGUE loop is a copy of the original loop so they share the same
10623      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
10624      point to the copied statements.  We also create a mapping of all LHS' in
10625      the original loop and all the LHS' in the EPILOGUE and create worklists to
10626      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
10627   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10628     {
10629       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10630            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10631         {
10632           new_stmt = epilogue_phi_gsi.phi ();
10633
10634           gcc_assert (gimple_uid (new_stmt) > 0);
10635           stmt_vinfo
10636             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10637
10638           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10639           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10640
10641           mapping.put (gimple_phi_result (orig_stmt),
10642                        gimple_phi_result (new_stmt));
10643           /* PHI nodes can not have patterns or related statements.  */
10644           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10645                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10646         }
10647
10648       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10649            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10650         {
10651           new_stmt = gsi_stmt (epilogue_gsi);
10652           if (is_gimple_debug (new_stmt))
10653             continue;
10654
10655           gcc_assert (gimple_uid (new_stmt) > 0);
10656           stmt_vinfo
10657             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10658
10659           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10660           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10661
10662           if (tree old_lhs = gimple_get_lhs (orig_stmt))
10663             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10664
10665           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10666             {
10667               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10668               for (gimple_stmt_iterator gsi = gsi_start (seq);
10669                    !gsi_end_p (gsi); gsi_next (&gsi))
10670                 stmt_worklist.safe_push (gsi_stmt (gsi));
10671             }
10672
10673           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10674           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10675             {
10676               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10677               stmt_worklist.safe_push (stmt);
10678               /* Set BB such that the assert in
10679                 'get_initial_def_for_reduction' is able to determine that
10680                 the BB of the related stmt is inside this loop.  */
10681               gimple_set_bb (stmt,
10682                              gimple_bb (new_stmt));
10683               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10684               gcc_assert (related_vinfo == NULL
10685                           || related_vinfo == stmt_vinfo);
10686             }
10687         }
10688     }
10689
10690   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10691      using the original main loop and thus need to be updated to refer to the
10692      cloned variables used in the epilogue.  */
10693   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10694     {
10695       gimple *stmt = stmt_worklist[i];
10696       tree *new_op;
10697
10698       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10699         {
10700           tree op = gimple_op (stmt, j);
10701           if ((new_op = mapping.get(op)))
10702             gimple_set_op (stmt, j, *new_op);
10703           else
10704             {
10705               /* PR92429: The last argument of simplify_replace_tree disables
10706                  folding when replacing arguments.  This is required as
10707                  otherwise you might end up with different statements than the
10708                  ones analyzed in vect_loop_analyze, leading to different
10709                  vectorization.  */
10710               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10711                                           &find_in_mapping, &mapping, false);
10712               gimple_set_op (stmt, j, op);
10713             }
10714         }
10715     }
10716
10717   struct data_reference *dr;
10718   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10719   FOR_EACH_VEC_ELT (datarefs, i, dr)
10720     {
10721       orig_stmt = DR_STMT (dr);
10722       gcc_assert (gimple_uid (orig_stmt) > 0);
10723       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10724       /* Data references for gather loads and scatter stores do not use the
10725          updated offset we set using ADVANCE.  Instead we have to make sure the
10726          reference in the data references point to the corresponding copy of
10727          the original in the epilogue.  */
10728       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10729           == VMAT_GATHER_SCATTER)
10730         {
10731           DR_REF (dr)
10732             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10733                                      &find_in_mapping, &mapping);
10734           DR_BASE_ADDRESS (dr)
10735             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10736                                      &find_in_mapping, &mapping);
10737         }
10738       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10739       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10740       /* The vector size of the epilogue is smaller than that of the main loop
10741          so the alignment is either the same or lower. This means the dr will
10742          thus by definition be aligned.  */
10743       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10744     }
10745
10746   epilogue_vinfo->shared->datarefs_copy.release ();
10747   epilogue_vinfo->shared->save_datarefs ();
10748 }
10749
10750 /* Function vect_transform_loop.
10751
10752    The analysis phase has determined that the loop is vectorizable.
10753    Vectorize the loop - created vectorized stmts to replace the scalar
10754    stmts in the loop, and update the loop exit condition.
10755    Returns scalar epilogue loop if any.  */
10756
10757 class loop *
10758 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10759 {
10760   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10761   class loop *epilogue = NULL;
10762   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10763   int nbbs = loop->num_nodes;
10764   int i;
10765   tree niters_vector = NULL_TREE;
10766   tree step_vector = NULL_TREE;
10767   tree niters_vector_mult_vf = NULL_TREE;
10768   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10769   unsigned int lowest_vf = constant_lower_bound (vf);
10770   gimple *stmt;
10771   bool check_profitability = false;
10772   unsigned int th;
10773
10774   DUMP_VECT_SCOPE ("vec_transform_loop");
10775
10776   loop_vinfo->shared->check_datarefs ();
10777
10778   /* Use the more conservative vectorization threshold.  If the number
10779      of iterations is constant assume the cost check has been performed
10780      by our caller.  If the threshold makes all loops profitable that
10781      run at least the (estimated) vectorization factor number of times
10782      checking is pointless, too.  */
10783   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10784   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10785     {
10786       if (dump_enabled_p ())
10787         dump_printf_loc (MSG_NOTE, vect_location,
10788                          "Profitability threshold is %d loop iterations.\n",
10789                          th);
10790       check_profitability = true;
10791     }
10792
10793   /* Make sure there exists a single-predecessor exit bb.  Do this before
10794      versioning.   */
10795   edge e = single_exit (loop);
10796   if (! single_pred_p (e->dest))
10797     {
10798       split_loop_exit_edge (e, true);
10799       if (dump_enabled_p ())
10800         dump_printf (MSG_NOTE, "split exit edge\n");
10801     }
10802
10803   /* Version the loop first, if required, so the profitability check
10804      comes first.  */
10805
10806   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10807     {
10808       class loop *sloop
10809         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10810       sloop->force_vectorize = false;
10811       check_profitability = false;
10812     }
10813
10814   /* Make sure there exists a single-predecessor exit bb also on the
10815      scalar loop copy.  Do this after versioning but before peeling
10816      so CFG structure is fine for both scalar and if-converted loop
10817      to make slpeel_duplicate_current_defs_from_edges face matched
10818      loop closed PHI nodes on the exit.  */
10819   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10820     {
10821       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10822       if (! single_pred_p (e->dest))
10823         {
10824           split_loop_exit_edge (e, true);
10825           if (dump_enabled_p ())
10826             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10827         }
10828     }
10829
10830   tree niters = vect_build_loop_niters (loop_vinfo);
10831   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10832   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10833   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10834   tree advance;
10835   drs_init_vec orig_drs_init;
10836
10837   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10838                               &step_vector, &niters_vector_mult_vf, th,
10839                               check_profitability, niters_no_overflow,
10840                               &advance);
10841
10842   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10843       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10844     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10845                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10846
10847   if (niters_vector == NULL_TREE)
10848     {
10849       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10850           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10851           && known_eq (lowest_vf, vf))
10852         {
10853           niters_vector
10854             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10855                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10856           step_vector = build_one_cst (TREE_TYPE (niters));
10857         }
10858       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10859         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10860                                      &step_vector, niters_no_overflow);
10861       else
10862         /* vect_do_peeling subtracted the number of peeled prologue
10863            iterations from LOOP_VINFO_NITERS.  */
10864         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10865                                      &niters_vector, &step_vector,
10866                                      niters_no_overflow);
10867     }
10868
10869   /* 1) Make sure the loop header has exactly two entries
10870      2) Make sure we have a preheader basic block.  */
10871
10872   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10873
10874   split_edge (loop_preheader_edge (loop));
10875
10876   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10877     /* This will deal with any possible peeling.  */
10878     vect_prepare_for_masked_peels (loop_vinfo);
10879
10880   /* Schedule the SLP instances first, then handle loop vectorization
10881      below.  */
10882   if (!loop_vinfo->slp_instances.is_empty ())
10883     {
10884       DUMP_VECT_SCOPE ("scheduling SLP instances");
10885       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10886     }
10887
10888   /* FORNOW: the vectorizer supports only loops which body consist
10889      of one basic block (header + empty latch). When the vectorizer will
10890      support more involved loop forms, the order by which the BBs are
10891      traversed need to be reconsidered.  */
10892
10893   for (i = 0; i < nbbs; i++)
10894     {
10895       basic_block bb = bbs[i];
10896       stmt_vec_info stmt_info;
10897
10898       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10899            gsi_next (&si))
10900         {
10901           gphi *phi = si.phi ();
10902           if (dump_enabled_p ())
10903             dump_printf_loc (MSG_NOTE, vect_location,
10904                              "------>vectorizing phi: %G", (gimple *) phi);
10905           stmt_info = loop_vinfo->lookup_stmt (phi);
10906           if (!stmt_info)
10907             continue;
10908
10909           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10910             vect_loop_kill_debug_uses (loop, stmt_info);
10911
10912           if (!STMT_VINFO_RELEVANT_P (stmt_info)
10913               && !STMT_VINFO_LIVE_P (stmt_info))
10914             continue;
10915
10916           if (STMT_VINFO_VECTYPE (stmt_info)
10917               && (maybe_ne
10918                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10919               && dump_enabled_p ())
10920             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10921
10922           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10923                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10924                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10925                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10926                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
10927                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10928               && ! PURE_SLP_STMT (stmt_info))
10929             {
10930               if (dump_enabled_p ())
10931                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10932               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10933             }
10934         }
10935
10936       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10937            gsi_next (&si))
10938         {
10939           gphi *phi = si.phi ();
10940           stmt_info = loop_vinfo->lookup_stmt (phi);
10941           if (!stmt_info)
10942             continue;
10943
10944           if (!STMT_VINFO_RELEVANT_P (stmt_info)
10945               && !STMT_VINFO_LIVE_P (stmt_info))
10946             continue;
10947
10948           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10949                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10950                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10951                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10952                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
10953                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
10954               && ! PURE_SLP_STMT (stmt_info))
10955             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10956         }
10957
10958       for (gimple_stmt_iterator si = gsi_start_bb (bb);
10959            !gsi_end_p (si);)
10960         {
10961           stmt = gsi_stmt (si);
10962           /* During vectorization remove existing clobber stmts.  */
10963           if (gimple_clobber_p (stmt))
10964             {
10965               unlink_stmt_vdef (stmt);
10966               gsi_remove (&si, true);
10967               release_defs (stmt);
10968             }
10969           else
10970             {
10971               /* Ignore vector stmts created in the outer loop.  */
10972               stmt_info = loop_vinfo->lookup_stmt (stmt);
10973
10974               /* vector stmts created in the outer-loop during vectorization of
10975                  stmts in an inner-loop may not have a stmt_info, and do not
10976                  need to be vectorized.  */
10977               stmt_vec_info seen_store = NULL;
10978               if (stmt_info)
10979                 {
10980                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10981                     {
10982                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10983                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10984                            !gsi_end_p (subsi); gsi_next (&subsi))
10985                         {
10986                           stmt_vec_info pat_stmt_info
10987                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10988                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10989                                                     &si, &seen_store);
10990                         }
10991                       stmt_vec_info pat_stmt_info
10992                         = STMT_VINFO_RELATED_STMT (stmt_info);
10993                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10994                                                     &si, &seen_store))
10995                         maybe_set_vectorized_backedge_value (loop_vinfo,
10996                                                              pat_stmt_info);
10997                     }
10998                   else
10999                     {
11000                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11001                                                     &seen_store))
11002                         maybe_set_vectorized_backedge_value (loop_vinfo,
11003                                                              stmt_info);
11004                     }
11005                 }
11006               gsi_next (&si);
11007               if (seen_store)
11008                 {
11009                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11010                     /* Interleaving.  If IS_STORE is TRUE, the
11011                        vectorization of the interleaving chain was
11012                        completed - free all the stores in the chain.  */
11013                     vect_remove_stores (loop_vinfo,
11014                                         DR_GROUP_FIRST_ELEMENT (seen_store));
11015                   else
11016                     /* Free the attached stmt_vec_info and remove the stmt.  */
11017                     loop_vinfo->remove_stmt (stmt_info);
11018                 }
11019             }
11020         }
11021
11022       /* Stub out scalar statements that must not survive vectorization.
11023          Doing this here helps with grouped statements, or statements that
11024          are involved in patterns.  */
11025       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11026            !gsi_end_p (gsi); gsi_next (&gsi))
11027         {
11028           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11029           if (!call || !gimple_call_internal_p (call))
11030             continue;
11031           internal_fn ifn = gimple_call_internal_fn (call);
11032           if (ifn == IFN_MASK_LOAD)
11033             {
11034               tree lhs = gimple_get_lhs (call);
11035               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11036                 {
11037                   tree zero = build_zero_cst (TREE_TYPE (lhs));
11038                   gimple *new_stmt = gimple_build_assign (lhs, zero);
11039                   gsi_replace (&gsi, new_stmt, true);
11040                 }
11041             }
11042           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11043             {
11044               tree lhs = gimple_get_lhs (call);
11045               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11046                 {
11047                   tree else_arg
11048                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11049                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11050                   gsi_replace (&gsi, new_stmt, true);
11051                 }
11052             }
11053         }
11054     }                           /* BBs in loop */
11055
11056   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11057      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11058   if (integer_onep (step_vector))
11059     niters_no_overflow = true;
11060   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11061                            niters_vector_mult_vf, !niters_no_overflow);
11062
11063   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11064   scale_profile_for_vect_loop (loop, assumed_vf);
11065
11066   /* True if the final iteration might not handle a full vector's
11067      worth of scalar iterations.  */
11068   bool final_iter_may_be_partial
11069     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11070   /* The minimum number of iterations performed by the epilogue.  This
11071      is 1 when peeling for gaps because we always need a final scalar
11072      iteration.  */
11073   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11074   /* +1 to convert latch counts to loop iteration counts,
11075      -min_epilogue_iters to remove iterations that cannot be performed
11076        by the vector code.  */
11077   int bias_for_lowest = 1 - min_epilogue_iters;
11078   int bias_for_assumed = bias_for_lowest;
11079   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11080   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11081     {
11082       /* When the amount of peeling is known at compile time, the first
11083          iteration will have exactly alignment_npeels active elements.
11084          In the worst case it will have at least one.  */
11085       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11086       bias_for_lowest += lowest_vf - min_first_active;
11087       bias_for_assumed += assumed_vf - min_first_active;
11088     }
11089   /* In these calculations the "- 1" converts loop iteration counts
11090      back to latch counts.  */
11091   if (loop->any_upper_bound)
11092     {
11093       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11094       loop->nb_iterations_upper_bound
11095         = (final_iter_may_be_partial
11096            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11097                             lowest_vf) - 1
11098            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11099                              lowest_vf) - 1);
11100       if (main_vinfo
11101           /* Both peeling for alignment and peeling for gaps can end up
11102              with the scalar epilogue running for more than VF-1 iterations.  */
11103           && !main_vinfo->peeling_for_alignment
11104           && !main_vinfo->peeling_for_gaps)
11105         {
11106           unsigned int bound;
11107           poly_uint64 main_iters
11108             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11109                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11110           main_iters
11111             = upper_bound (main_iters,
11112                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11113           if (can_div_away_from_zero_p (main_iters,
11114                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11115                                         &bound))
11116             loop->nb_iterations_upper_bound
11117               = wi::umin ((widest_int) (bound - 1),
11118                           loop->nb_iterations_upper_bound);
11119       }
11120   }
11121   if (loop->any_likely_upper_bound)
11122     loop->nb_iterations_likely_upper_bound
11123       = (final_iter_may_be_partial
11124          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11125                           + bias_for_lowest, lowest_vf) - 1
11126          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11127                            + bias_for_lowest, lowest_vf) - 1);
11128   if (loop->any_estimate)
11129     loop->nb_iterations_estimate
11130       = (final_iter_may_be_partial
11131          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11132                           assumed_vf) - 1
11133          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11134                            assumed_vf) - 1);
11135
11136   if (dump_enabled_p ())
11137     {
11138       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11139         {
11140           dump_printf_loc (MSG_NOTE, vect_location,
11141                            "LOOP VECTORIZED\n");
11142           if (loop->inner)
11143             dump_printf_loc (MSG_NOTE, vect_location,
11144                              "OUTER LOOP VECTORIZED\n");
11145           dump_printf (MSG_NOTE, "\n");
11146         }
11147       else
11148         dump_printf_loc (MSG_NOTE, vect_location,
11149                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11150                          GET_MODE_NAME (loop_vinfo->vector_mode));
11151     }
11152
11153   /* Loops vectorized with a variable factor won't benefit from
11154      unrolling/peeling.  */
11155   if (!vf.is_constant ())
11156     {
11157       loop->unroll = 1;
11158       if (dump_enabled_p ())
11159         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11160                          " variable-length vectorization factor\n");
11161     }
11162   /* Free SLP instances here because otherwise stmt reference counting
11163      won't work.  */
11164   slp_instance instance;
11165   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11166     vect_free_slp_instance (instance);
11167   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11168   /* Clear-up safelen field since its value is invalid after vectorization
11169      since vectorized loop can have loop-carried dependencies.  */
11170   loop->safelen = 0;
11171
11172   if (epilogue)
11173     {
11174       update_epilogue_loop_vinfo (epilogue, advance);
11175
11176       epilogue->simduid = loop->simduid;
11177       epilogue->force_vectorize = loop->force_vectorize;
11178       epilogue->dont_vectorize = false;
11179     }
11180
11181   return epilogue;
11182 }
11183
11184 /* The code below is trying to perform simple optimization - revert
11185    if-conversion for masked stores, i.e. if the mask of a store is zero
11186    do not perform it and all stored value producers also if possible.
11187    For example,
11188      for (i=0; i<n; i++)
11189        if (c[i])
11190         {
11191           p1[i] += 1;
11192           p2[i] = p3[i] +2;
11193         }
11194    this transformation will produce the following semi-hammock:
11195
11196    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11197      {
11198        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11199        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11200        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11201        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11202        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11203        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11204      }
11205 */
11206
11207 void
11208 optimize_mask_stores (class loop *loop)
11209 {
11210   basic_block *bbs = get_loop_body (loop);
11211   unsigned nbbs = loop->num_nodes;
11212   unsigned i;
11213   basic_block bb;
11214   class loop *bb_loop;
11215   gimple_stmt_iterator gsi;
11216   gimple *stmt;
11217   auto_vec<gimple *> worklist;
11218   auto_purge_vect_location sentinel;
11219
11220   vect_location = find_loop_location (loop);
11221   /* Pick up all masked stores in loop if any.  */
11222   for (i = 0; i < nbbs; i++)
11223     {
11224       bb = bbs[i];
11225       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11226            gsi_next (&gsi))
11227         {
11228           stmt = gsi_stmt (gsi);
11229           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11230             worklist.safe_push (stmt);
11231         }
11232     }
11233
11234   free (bbs);
11235   if (worklist.is_empty ())
11236     return;
11237
11238   /* Loop has masked stores.  */
11239   while (!worklist.is_empty ())
11240     {
11241       gimple *last, *last_store;
11242       edge e, efalse;
11243       tree mask;
11244       basic_block store_bb, join_bb;
11245       gimple_stmt_iterator gsi_to;
11246       tree vdef, new_vdef;
11247       gphi *phi;
11248       tree vectype;
11249       tree zero;
11250
11251       last = worklist.pop ();
11252       mask = gimple_call_arg (last, 2);
11253       bb = gimple_bb (last);
11254       /* Create then_bb and if-then structure in CFG, then_bb belongs to
11255          the same loop as if_bb.  It could be different to LOOP when two
11256          level loop-nest is vectorized and mask_store belongs to the inner
11257          one.  */
11258       e = split_block (bb, last);
11259       bb_loop = bb->loop_father;
11260       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11261       join_bb = e->dest;
11262       store_bb = create_empty_bb (bb);
11263       add_bb_to_loop (store_bb, bb_loop);
11264       e->flags = EDGE_TRUE_VALUE;
11265       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11266       /* Put STORE_BB to likely part.  */
11267       efalse->probability = profile_probability::unlikely ();
11268       store_bb->count = efalse->count ();
11269       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11270       if (dom_info_available_p (CDI_DOMINATORS))
11271         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11272       if (dump_enabled_p ())
11273         dump_printf_loc (MSG_NOTE, vect_location,
11274                          "Create new block %d to sink mask stores.",
11275                          store_bb->index);
11276       /* Create vector comparison with boolean result.  */
11277       vectype = TREE_TYPE (mask);
11278       zero = build_zero_cst (vectype);
11279       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11280       gsi = gsi_last_bb (bb);
11281       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11282       /* Create new PHI node for vdef of the last masked store:
11283          .MEM_2 = VDEF <.MEM_1>
11284          will be converted to
11285          .MEM.3 = VDEF <.MEM_1>
11286          and new PHI node will be created in join bb
11287          .MEM_2 = PHI <.MEM_1, .MEM_3>
11288       */
11289       vdef = gimple_vdef (last);
11290       new_vdef = make_ssa_name (gimple_vop (cfun), last);
11291       gimple_set_vdef (last, new_vdef);
11292       phi = create_phi_node (vdef, join_bb);
11293       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11294
11295       /* Put all masked stores with the same mask to STORE_BB if possible.  */
11296       while (true)
11297         {
11298           gimple_stmt_iterator gsi_from;
11299           gimple *stmt1 = NULL;
11300
11301           /* Move masked store to STORE_BB.  */
11302           last_store = last;
11303           gsi = gsi_for_stmt (last);
11304           gsi_from = gsi;
11305           /* Shift GSI to the previous stmt for further traversal.  */
11306           gsi_prev (&gsi);
11307           gsi_to = gsi_start_bb (store_bb);
11308           gsi_move_before (&gsi_from, &gsi_to);
11309           /* Setup GSI_TO to the non-empty block start.  */
11310           gsi_to = gsi_start_bb (store_bb);
11311           if (dump_enabled_p ())
11312             dump_printf_loc (MSG_NOTE, vect_location,
11313                              "Move stmt to created bb\n%G", last);
11314           /* Move all stored value producers if possible.  */
11315           while (!gsi_end_p (gsi))
11316             {
11317               tree lhs;
11318               imm_use_iterator imm_iter;
11319               use_operand_p use_p;
11320               bool res;
11321
11322               /* Skip debug statements.  */
11323               if (is_gimple_debug (gsi_stmt (gsi)))
11324                 {
11325                   gsi_prev (&gsi);
11326                   continue;
11327                 }
11328               stmt1 = gsi_stmt (gsi);
11329               /* Do not consider statements writing to memory or having
11330                  volatile operand.  */
11331               if (gimple_vdef (stmt1)
11332                   || gimple_has_volatile_ops (stmt1))
11333                 break;
11334               gsi_from = gsi;
11335               gsi_prev (&gsi);
11336               lhs = gimple_get_lhs (stmt1);
11337               if (!lhs)
11338                 break;
11339
11340               /* LHS of vectorized stmt must be SSA_NAME.  */
11341               if (TREE_CODE (lhs) != SSA_NAME)
11342                 break;
11343
11344               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11345                 {
11346                   /* Remove dead scalar statement.  */
11347                   if (has_zero_uses (lhs))
11348                     {
11349                       gsi_remove (&gsi_from, true);
11350                       continue;
11351                     }
11352                 }
11353
11354               /* Check that LHS does not have uses outside of STORE_BB.  */
11355               res = true;
11356               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11357                 {
11358                   gimple *use_stmt;
11359                   use_stmt = USE_STMT (use_p);
11360                   if (is_gimple_debug (use_stmt))
11361                     continue;
11362                   if (gimple_bb (use_stmt) != store_bb)
11363                     {
11364                       res = false;
11365                       break;
11366                     }
11367                 }
11368               if (!res)
11369                 break;
11370
11371               if (gimple_vuse (stmt1)
11372                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
11373                 break;
11374
11375               /* Can move STMT1 to STORE_BB.  */
11376               if (dump_enabled_p ())
11377                 dump_printf_loc (MSG_NOTE, vect_location,
11378                                  "Move stmt to created bb\n%G", stmt1);
11379               gsi_move_before (&gsi_from, &gsi_to);
11380               /* Shift GSI_TO for further insertion.  */
11381               gsi_prev (&gsi_to);
11382             }
11383           /* Put other masked stores with the same mask to STORE_BB.  */
11384           if (worklist.is_empty ()
11385               || gimple_call_arg (worklist.last (), 2) != mask
11386               || worklist.last () != stmt1)
11387             break;
11388           last = worklist.pop ();
11389         }
11390       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11391     }
11392 }
11393
11394 /* Decide whether it is possible to use a zero-based induction variable
11395    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
11396    the value that the induction variable must be able to hold in order
11397    to ensure that the rgroups eventually have no active vector elements.
11398    Return -1 otherwise.  */
11399
11400 widest_int
11401 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11402 {
11403   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11404   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11405   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11406
11407   /* Calculate the value that the induction variable must be able
11408      to hit in order to ensure that we end the loop with an all-false mask.
11409      This involves adding the maximum number of inactive trailing scalar
11410      iterations.  */
11411   widest_int iv_limit = -1;
11412   if (max_loop_iterations (loop, &iv_limit))
11413     {
11414       if (niters_skip)
11415         {
11416           /* Add the maximum number of skipped iterations to the
11417              maximum iteration count.  */
11418           if (TREE_CODE (niters_skip) == INTEGER_CST)
11419             iv_limit += wi::to_widest (niters_skip);
11420           else
11421             iv_limit += max_vf - 1;
11422         }
11423       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11424         /* Make a conservatively-correct assumption.  */
11425         iv_limit += max_vf - 1;
11426
11427       /* IV_LIMIT is the maximum number of latch iterations, which is also
11428          the maximum in-range IV value.  Round this value down to the previous
11429          vector alignment boundary and then add an extra full iteration.  */
11430       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11431       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11432     }
11433   return iv_limit;
11434 }
11435
11436 /* For the given rgroup_controls RGC, check whether an induction variable
11437    would ever hit a value that produces a set of all-false masks or zero
11438    lengths before wrapping around.  Return true if it's possible to wrap
11439    around before hitting the desirable value, otherwise return false.  */
11440
11441 bool
11442 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11443 {
11444   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11445
11446   if (iv_limit == -1)
11447     return true;
11448
11449   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11450   unsigned int compare_precision = TYPE_PRECISION (compare_type);
11451   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11452
11453   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11454     return true;
11455
11456   return false;
11457 }