gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57 #include "case-cfn-macros.h"
  58 #include "langhooks.h"
  59
  60 /* Loop Vectorization Pass.
  61
  62    This pass tries to vectorize loops.
  63
  64    For example, the vectorizer transforms the following simple loop:
  65
  66         short a[N]; short b[N]; short c[N]; int i;
  67
  68         for (i=0; i<N; i++){
  69           a[i] = b[i] + c[i];
  70         }
  71
  72    as if it was manually vectorized by rewriting the source code into:
  73
  74         typedef int __attribute__((mode(V8HI))) v8hi;
  75         short a[N];  short b[N]; short c[N];   int i;
  76         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  77         v8hi va, vb, vc;
  78
  79         for (i=0; i<N/8; i++){
  80           vb = pb[i];
  81           vc = pc[i];
  82           va = vb + vc;
  83           pa[i] = va;
  84         }
  85
  86         The main entry to this pass is vectorize_loops(), in which
  87    the vectorizer applies a set of analyses on a given set of loops,
  88    followed by the actual vectorization transformation for the loops that
  89    had successfully passed the analysis phase.
  90         Throughout this pass we make a distinction between two types of
  91    data: scalars (which are represented by SSA_NAMES), and memory references
  92    ("data-refs").  These two types of data require different handling both
  93    during analysis and transformation. The types of data-refs that the
  94    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  95    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  96    accesses are required to have a simple (consecutive) access pattern.
  97
  98    Analysis phase:
  99    ===============
 100         The driver for the analysis phase is vect_analyze_loop().
 101    It applies a set of analyses, some of which rely on the scalar evolution
 102    analyzer (scev) developed by Sebastian Pop.
 103
 104         During the analysis phase the vectorizer records some information
 105    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 106    loop, as well as general information about the loop as a whole, which is
 107    recorded in a "loop_vec_info" struct attached to each loop.
 108
 109    Transformation phase:
 110    =====================
 111         The loop transformation phase scans all the stmts in the loop, and
 112    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 113    the loop that needs to be vectorized.  It inserts the vector code sequence
 114    just before the scalar stmt S, and records a pointer to the vector code
 115    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 116    attached to S).  This pointer will be used for the vectorization of following
 117    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 118    otherwise, we rely on dead code elimination for removing it.
 119
 120         For example, say stmt S1 was vectorized into stmt VS1:
 121
 122    VS1: vb = px[i];
 123    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 124    S2:  a = b;
 125
 126    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 127    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 128    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 129    resulting sequence would be:
 130
 131    VS1: vb = px[i];
 132    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 133    VS2: va = vb;
 134    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 135
 136         Operands that are not SSA_NAMEs, are data-refs that appear in
 137    load/store operations (like 'x[i]' in S1), and are handled differently.
 138
 139    Target modeling:
 140    =================
 141         Currently the only target specific information that is used is the
 142    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 143    Targets that can support different sizes of vectors, for now will need
 144    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 145    flexibility will be added in the future.
 146
 147         Since we only vectorize operations which vector form can be
 148    expressed using existing tree codes, to verify that an operation is
 149    supported, the vectorizer checks the relevant optab at the relevant
 150    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 151    the value found is CODE_FOR_nothing, then there's no target support, and
 152    we can't vectorize the stmt.
 153
 154    For additional information on this project see:
 155    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 156 */
 157
 158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 159                                                 unsigned *);
 160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 161                                                bool *, bool *, bool);
 162
 163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 164    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 165    may already be set for general statements (not just data refs).  */
 166
 167 static opt_result
 168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 169                               bool vectype_maybe_set_p,
 170                               poly_uint64 *vf)
 171 {
 172   gimple *stmt = stmt_info->stmt;
 173
 174   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 175        && !STMT_VINFO_LIVE_P (stmt_info))
 176       || gimple_clobber_p (stmt))
 177     {
 178       if (dump_enabled_p ())
 179         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 180       return opt_result::success ();
 181     }
 182
 183   tree stmt_vectype, nunits_vectype;
 184   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 185                                                    &stmt_vectype,
 186                                                    &nunits_vectype);
 187   if (!res)
 188     return res;
 189
 190   if (stmt_vectype)
 191     {
 192       if (STMT_VINFO_VECTYPE (stmt_info))
 193         /* The only case when a vectype had been already set is for stmts
 194            that contain a data ref, or for "pattern-stmts" (stmts generated
 195            by the vectorizer to represent/replace a certain idiom).  */
 196         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 197                      || vectype_maybe_set_p)
 198                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 199       else
 200         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 201     }
 202
 203   if (nunits_vectype)
 204     vect_update_max_nunits (vf, nunits_vectype);
 205
 206   return opt_result::success ();
 207 }
 208
 209 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 210    types of STMT_INFO and all attached pattern statements and update
 211    the vectorization factor VF accordingly.  Return true on success
 212    or false if something prevented vectorization.  */
 213
 214 static opt_result
 215 vect_determine_vf_for_stmt (vec_info *vinfo,
 216                             stmt_vec_info stmt_info, poly_uint64 *vf)
 217 {
 218   if (dump_enabled_p ())
 219     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 220                      stmt_info->stmt);
 221   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 222   if (!res)
 223     return res;
 224
 225   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 226       && STMT_VINFO_RELATED_STMT (stmt_info))
 227     {
 228       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 229       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 230
 231       /* If a pattern statement has def stmts, analyze them too.  */
 232       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 233            !gsi_end_p (si); gsi_next (&si))
 234         {
 235           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 236           if (dump_enabled_p ())
 237             dump_printf_loc (MSG_NOTE, vect_location,
 238                              "==> examining pattern def stmt: %G",
 239                              def_stmt_info->stmt);
 240           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 241           if (!res)
 242             return res;
 243         }
 244
 245       if (dump_enabled_p ())
 246         dump_printf_loc (MSG_NOTE, vect_location,
 247                          "==> examining pattern statement: %G",
 248                          stmt_info->stmt);
 249       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 250       if (!res)
 251         return res;
 252     }
 253
 254   return opt_result::success ();
 255 }
 256
 257 /* Function vect_determine_vectorization_factor
 258
 259    Determine the vectorization factor (VF).  VF is the number of data elements
 260    that are operated upon in parallel in a single iteration of the vectorized
 261    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 262    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 263    elements can fit in a single vector register.
 264
 265    We currently support vectorization of loops in which all types operated upon
 266    are of the same size.  Therefore this function currently sets VF according to
 267    the size of the types operated upon, and fails if there are multiple sizes
 268    in the loop.
 269
 270    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 271    original loop:
 272         for (i=0; i<N; i++){
 273           a[i] = b[i] + c[i];
 274         }
 275
 276    vectorized loop:
 277         for (i=0; i<N; i+=VF){
 278           a[i:VF] = b[i:VF] + c[i:VF];
 279         }
 280 */
 281
 282 static opt_result
 283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 284 {
 285   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 286   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 287   unsigned nbbs = loop->num_nodes;
 288   poly_uint64 vectorization_factor = 1;
 289   tree scalar_type = NULL_TREE;
 290   gphi *phi;
 291   tree vectype;
 292   stmt_vec_info stmt_info;
 293   unsigned i;
 294
 295   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 296
 297   for (i = 0; i < nbbs; i++)
 298     {
 299       basic_block bb = bbs[i];
 300
 301       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 302            gsi_next (&si))
 303         {
 304           phi = si.phi ();
 305           stmt_info = loop_vinfo->lookup_stmt (phi);
 306           if (dump_enabled_p ())
 307             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 308                              (gimple *) phi);
 309
 310           gcc_assert (stmt_info);
 311
 312           if (STMT_VINFO_RELEVANT_P (stmt_info)
 313               || STMT_VINFO_LIVE_P (stmt_info))
 314             {
 315               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 316               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 317
 318               if (dump_enabled_p ())
 319                 dump_printf_loc (MSG_NOTE, vect_location,
 320                                  "get vectype for scalar type:  %T\n",
 321                                  scalar_type);
 322
 323               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 324               if (!vectype)
 325                 return opt_result::failure_at (phi,
 326                                                "not vectorized: unsupported "
 327                                                "data-type %T\n",
 328                                                scalar_type);
 329               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 330
 331               if (dump_enabled_p ())
 332                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 333                                  vectype);
 334
 335               if (dump_enabled_p ())
 336                 {
 337                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 338                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 339                   dump_printf (MSG_NOTE, "\n");
 340                 }
 341
 342               vect_update_max_nunits (&vectorization_factor, vectype);
 343             }
 344         }
 345
 346       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 347            gsi_next (&si))
 348         {
 349           if (is_gimple_debug (gsi_stmt (si)))
 350             continue;
 351           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 352           opt_result res
 353             = vect_determine_vf_for_stmt (loop_vinfo,
 354                                           stmt_info, &vectorization_factor);
 355           if (!res)
 356             return res;
 357         }
 358     }
 359
 360   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 361   if (dump_enabled_p ())
 362     {
 363       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 364       dump_dec (MSG_NOTE, vectorization_factor);
 365       dump_printf (MSG_NOTE, "\n");
 366     }
 367
 368   if (known_le (vectorization_factor, 1U))
 369     return opt_result::failure_at (vect_location,
 370                                    "not vectorized: unsupported data-type\n");
 371   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 372   return opt_result::success ();
 373 }
 374
 375
 376 /* Function vect_is_simple_iv_evolution.
 377
 378    FORNOW: A simple evolution of an induction variables in the loop is
 379    considered a polynomial evolution.  */
 380
 381 static bool
 382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 383                              tree * step)
 384 {
 385   tree init_expr;
 386   tree step_expr;
 387   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 388   basic_block bb;
 389
 390   /* When there is no evolution in this loop, the evolution function
 391      is not "simple".  */
 392   if (evolution_part == NULL_TREE)
 393     return false;
 394
 395   /* When the evolution is a polynomial of degree >= 2
 396      the evolution function is not "simple".  */
 397   if (tree_is_chrec (evolution_part))
 398     return false;
 399
 400   step_expr = evolution_part;
 401   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 402
 403   if (dump_enabled_p ())
 404     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 405                      step_expr, init_expr);
 406
 407   *init = init_expr;
 408   *step = step_expr;
 409
 410   if (TREE_CODE (step_expr) != INTEGER_CST
 411       && (TREE_CODE (step_expr) != SSA_NAME
 412           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 413               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 414           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 415               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 416                   || !flag_associative_math)))
 417       && (TREE_CODE (step_expr) != REAL_CST
 418           || !flag_associative_math))
 419     {
 420       if (dump_enabled_p ())
 421         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 422                          "step unknown.\n");
 423       return false;
 424     }
 425
 426   return true;
 427 }
 428
 429 /* Function vect_is_nonlinear_iv_evolution
 430
 431    Only support nonlinear induction for integer type
 432    1. neg
 433    2. mul by constant
 434    3. lshift/rshift by constant.
 435
 436    For neg induction, return a fake step as integer -1.  */
 437 static bool
 438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 439                                 gphi* loop_phi_node, tree *init, tree *step)
 440 {
 441   tree init_expr, ev_expr, result, op1, op2;
 442   gimple* def;
 443
 444   if (gimple_phi_num_args (loop_phi_node) != 2)
 445     return false;
 446
 447   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 448   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 449
 450   /* Support nonlinear induction only for integer type.  */
 451   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 452     return false;
 453
 454   *init = init_expr;
 455   result = PHI_RESULT (loop_phi_node);
 456
 457   if (TREE_CODE (ev_expr) != SSA_NAME
 458       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 459       || !is_gimple_assign (def))
 460     return false;
 461
 462   enum tree_code t_code = gimple_assign_rhs_code (def);
 463   switch (t_code)
 464     {
 465     case NEGATE_EXPR:
 466       if (gimple_assign_rhs1 (def) != result)
 467         return false;
 468       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 469       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 470       break;
 471
 472     case RSHIFT_EXPR:
 473     case LSHIFT_EXPR:
 474     case MULT_EXPR:
 475       op1 = gimple_assign_rhs1 (def);
 476       op2 = gimple_assign_rhs2 (def);
 477       if (TREE_CODE (op2) != INTEGER_CST
 478           || op1 != result)
 479         return false;
 480       *step = op2;
 481       if (t_code == LSHIFT_EXPR)
 482         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 483       else if (t_code == RSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 485       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 486       else
 487         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 488       break;
 489
 490     default:
 491       return false;
 492     }
 493
 494   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 495   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 496
 497   return true;
 498 }
 499
 500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 501    what we are assuming is a double reduction.  For example, given
 502    a structure like this:
 503
 504       outer1:
 505         x_1 = PHI <x_4(outer2), ...>;
 506         ...
 507
 508       inner:
 509         x_2 = PHI <x_1(outer1), ...>;
 510         ...
 511         x_3 = ...;
 512         ...
 513
 514       outer2:
 515         x_4 = PHI <x_3(inner)>;
 516         ...
 517
 518    outer loop analysis would treat x_1 as a double reduction phi and
 519    this function would then return true for x_2.  */
 520
 521 static bool
 522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 523 {
 524   use_operand_p use_p;
 525   ssa_op_iter op_iter;
 526   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 527     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 528       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 529         return true;
 530   return false;
 531 }
 532
 533 /* Returns true if Phi is a first-order recurrence. A first-order
 534    recurrence is a non-reduction recurrence relation in which the value of
 535    the recurrence in the current loop iteration equals a value defined in
 536    the previous iteration.  */
 537
 538 static bool
 539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 540                                    gphi *phi)
 541 {
 542   /* A nested cycle isn't vectorizable as first order recurrence.  */
 543   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 544     return false;
 545
 546   /* Ensure the loop latch definition is from within the loop.  */
 547   edge latch = loop_latch_edge (loop);
 548   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 549   if (TREE_CODE (ldef) != SSA_NAME
 550       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 551       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 552       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 553     return false;
 554
 555   tree def = gimple_phi_result (phi);
 556
 557   /* Ensure every use_stmt of the phi node is dominated by the latch
 558      definition.  */
 559   imm_use_iterator imm_iter;
 560   use_operand_p use_p;
 561   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 562     if (!is_gimple_debug (USE_STMT (use_p))
 563         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 564             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 565                                             USE_STMT (use_p))))
 566       return false;
 567
 568   /* First-order recurrence autovectorization needs shuffle vector.  */
 569   tree scalar_type = TREE_TYPE (def);
 570   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 571   if (!vectype)
 572     return false;
 573
 574   return true;
 575 }
 576
 577 /* Function vect_analyze_scalar_cycles_1.
 578
 579    Examine the cross iteration def-use cycles of scalar variables
 580    in LOOP.  LOOP_VINFO represents the loop that is now being
 581    considered for vectorization (can be LOOP, or an outer-loop
 582    enclosing LOOP).  SLP indicates there will be some subsequent
 583    slp analyses or not.  */
 584
 585 static void
 586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 587                               bool slp)
 588 {
 589   basic_block bb = loop->header;
 590   tree init, step;
 591   auto_vec<stmt_vec_info, 64> worklist;
 592   gphi_iterator gsi;
 593   bool double_reduc, reduc_chain;
 594
 595   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 596
 597   /* First - identify all inductions.  Reduction detection assumes that all the
 598      inductions have been identified, therefore, this order must not be
 599      changed.  */
 600   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 601     {
 602       gphi *phi = gsi.phi ();
 603       tree access_fn = NULL;
 604       tree def = PHI_RESULT (phi);
 605       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 606
 607       if (dump_enabled_p ())
 608         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 609                          (gimple *) phi);
 610
 611       /* Skip virtual phi's.  The data dependences that are associated with
 612          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 613       if (virtual_operand_p (def))
 614         continue;
 615
 616       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 617
 618       /* Analyze the evolution function.  */
 619       access_fn = analyze_scalar_evolution (loop, def);
 620       if (access_fn)
 621         {
 622           STRIP_NOPS (access_fn);
 623           if (dump_enabled_p ())
 624             dump_printf_loc (MSG_NOTE, vect_location,
 625                              "Access function of PHI: %T\n", access_fn);
 626           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 627             = initial_condition_in_loop_num (access_fn, loop->num);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 629             = evolution_part_in_loop_num (access_fn, loop->num);
 630         }
 631
 632       if ((!access_fn
 633            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 634            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 635                                             &init, &step)
 636            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 637                && TREE_CODE (step) != INTEGER_CST))
 638           /* Only handle nonlinear iv for same loop.  */
 639           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 640               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 641                                                   phi, &init, &step)))
 642         {
 643           worklist.safe_push (stmt_vinfo);
 644           continue;
 645         }
 646
 647       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 648                   != NULL_TREE);
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 650
 651       if (dump_enabled_p ())
 652         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 653       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 654     }
 655
 656
 657   /* Second - identify all reductions and nested cycles.  */
 658   while (worklist.length () > 0)
 659     {
 660       stmt_vec_info stmt_vinfo = worklist.pop ();
 661       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 662       tree def = PHI_RESULT (phi);
 663
 664       if (dump_enabled_p ())
 665         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 666                          (gimple *) phi);
 667
 668       gcc_assert (!virtual_operand_p (def)
 669                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 670
 671       stmt_vec_info reduc_stmt_info
 672         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 673                                     &reduc_chain, slp);
 674       if (reduc_stmt_info)
 675         {
 676           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 677           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 678           if (double_reduc)
 679             {
 680               if (dump_enabled_p ())
 681                 dump_printf_loc (MSG_NOTE, vect_location,
 682                                  "Detected double reduction.\n");
 683
 684               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 685               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 686             }
 687           else
 688             {
 689               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 690                 {
 691                   if (dump_enabled_p ())
 692                     dump_printf_loc (MSG_NOTE, vect_location,
 693                                      "Detected vectorizable nested cycle.\n");
 694
 695                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 696                 }
 697               else
 698                 {
 699                   if (dump_enabled_p ())
 700                     dump_printf_loc (MSG_NOTE, vect_location,
 701                                      "Detected reduction.\n");
 702
 703                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 704                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 705                   /* Store the reduction cycles for possible vectorization in
 706                      loop-aware SLP if it was not detected as reduction
 707                      chain.  */
 708                   if (! reduc_chain)
 709                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 710                       (reduc_stmt_info);
 711                 }
 712             }
 713         }
 714       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 715         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 716       else
 717         if (dump_enabled_p ())
 718           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 719                            "Unknown def-use cycle pattern.\n");
 720     }
 721 }
 722
 723
 724 /* Function vect_analyze_scalar_cycles.
 725
 726    Examine the cross iteration def-use cycles of scalar variables, by
 727    analyzing the loop-header PHIs of scalar variables.  Classify each
 728    cycle as one of the following: invariant, induction, reduction, unknown.
 729    We do that for the loop represented by LOOP_VINFO, and also to its
 730    inner-loop, if exists.
 731    Examples for scalar cycles:
 732
 733    Example1: reduction:
 734
 735               loop1:
 736               for (i=0; i<N; i++)
 737                  sum += a[i];
 738
 739    Example2: induction:
 740
 741               loop2:
 742               for (i=0; i<N; i++)
 743                  a[i] = i;  */
 744
 745 static void
 746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 747 {
 748   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 749
 750   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 751
 752   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 753      Reductions in such inner-loop therefore have different properties than
 754      the reductions in the nest that gets vectorized:
 755      1. When vectorized, they are executed in the same order as in the original
 756         scalar loop, so we can't change the order of computation when
 757         vectorizing them.
 758      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 759         current checks are too strict.  */
 760
 761   if (loop->inner)
 762     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 763 }
 764
 765 /* Transfer group and reduction information from STMT_INFO to its
 766    pattern stmt.  */
 767
 768 static void
 769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 770 {
 771   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 772   stmt_vec_info stmtp;
 773   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 774               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 775   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 776   do
 777     {
 778       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 779       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 780                            == STMT_VINFO_DEF_TYPE (stmt_info));
 781       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 782       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 783       if (stmt_info)
 784         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 785           = STMT_VINFO_RELATED_STMT (stmt_info);
 786     }
 787   while (stmt_info);
 788 }
 789
 790 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 791
 792 static void
 793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 794 {
 795   stmt_vec_info first;
 796   unsigned i;
 797
 798   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 799     {
 800       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 801       while (next)
 802         {
 803           if ((STMT_VINFO_IN_PATTERN_P (next)
 804                != STMT_VINFO_IN_PATTERN_P (first))
 805               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 806             break;
 807           next = REDUC_GROUP_NEXT_ELEMENT (next);
 808         }
 809       /* If all reduction chain members are well-formed patterns adjust
 810          the group to group the pattern stmts instead.  */
 811       if (! next
 812           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 813         {
 814           if (STMT_VINFO_IN_PATTERN_P (first))
 815             {
 816               vect_fixup_reduc_chain (first);
 817               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 818                 = STMT_VINFO_RELATED_STMT (first);
 819             }
 820         }
 821       /* If not all stmt in the chain are patterns or if we failed
 822          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 823          it as regular reduction instead.  */
 824       else
 825         {
 826           stmt_vec_info vinfo = first;
 827           stmt_vec_info last = NULL;
 828           while (vinfo)
 829             {
 830               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 831               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 832               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 833               last = vinfo;
 834               vinfo = next;
 835             }
 836           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 837             = vect_internal_def;
 838           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 839           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 840           --i;
 841         }
 842     }
 843 }
 844
 845 /* Function vect_get_loop_niters.
 846
 847    Determine how many iterations the loop is executed and place it
 848    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 849    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 850    niter information holds in ASSUMPTIONS.
 851
 852    Return the loop exit condition.  */
 853
 854
 855 static gcond *
 856 vect_get_loop_niters (class loop *loop, tree *assumptions,
 857                       tree *number_of_iterations, tree *number_of_iterationsm1)
 858 {
 859   edge exit = single_exit (loop);
 860   class tree_niter_desc niter_desc;
 861   tree niter_assumptions, niter, may_be_zero;
 862   gcond *cond = get_loop_exit_condition (loop);
 863
 864   *assumptions = boolean_true_node;
 865   *number_of_iterationsm1 = chrec_dont_know;
 866   *number_of_iterations = chrec_dont_know;
 867   DUMP_VECT_SCOPE ("get_loop_niters");
 868
 869   if (!exit)
 870     return cond;
 871
 872   may_be_zero = NULL_TREE;
 873   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 874       || chrec_contains_undetermined (niter_desc.niter))
 875     return cond;
 876
 877   niter_assumptions = niter_desc.assumptions;
 878   may_be_zero = niter_desc.may_be_zero;
 879   niter = niter_desc.niter;
 880
 881   if (may_be_zero && integer_zerop (may_be_zero))
 882     may_be_zero = NULL_TREE;
 883
 884   if (may_be_zero)
 885     {
 886       if (COMPARISON_CLASS_P (may_be_zero))
 887         {
 888           /* Try to combine may_be_zero with assumptions, this can simplify
 889              computation of niter expression.  */
 890           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 891             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 892                                              niter_assumptions,
 893                                              fold_build1 (TRUTH_NOT_EXPR,
 894                                                           boolean_type_node,
 895                                                           may_be_zero));
 896           else
 897             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 898                                  build_int_cst (TREE_TYPE (niter), 0),
 899                                  rewrite_to_non_trapping_overflow (niter));
 900
 901           may_be_zero = NULL_TREE;
 902         }
 903       else if (integer_nonzerop (may_be_zero))
 904         {
 905           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 906           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 907           return cond;
 908         }
 909       else
 910         return cond;
 911     }
 912
 913   *assumptions = niter_assumptions;
 914   *number_of_iterationsm1 = niter;
 915
 916   /* We want the number of loop header executions which is the number
 917      of latch executions plus one.
 918      ???  For UINT_MAX latch executions this number overflows to zero
 919      for loops like do { n++; } while (n != 0);  */
 920   if (niter && !chrec_contains_undetermined (niter))
 921     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 922                           build_int_cst (TREE_TYPE (niter), 1));
 923   *number_of_iterations = niter;
 924
 925   return cond;
 926 }
 927
 928 /* Function bb_in_loop_p
 929
 930    Used as predicate for dfs order traversal of the loop bbs.  */
 931
 932 static bool
 933 bb_in_loop_p (const_basic_block bb, const void *data)
 934 {
 935   const class loop *const loop = (const class loop *)data;
 936   if (flow_bb_inside_loop_p (loop, bb))
 937     return true;
 938   return false;
 939 }
 940
 941
 942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 943    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 944
 945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 946   : vec_info (vec_info::loop, shared),
 947     loop (loop_in),
 948     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 949     num_itersm1 (NULL_TREE),
 950     num_iters (NULL_TREE),
 951     num_iters_unchanged (NULL_TREE),
 952     num_iters_assumptions (NULL_TREE),
 953     vector_costs (nullptr),
 954     scalar_costs (nullptr),
 955     th (0),
 956     versioning_threshold (0),
 957     vectorization_factor (0),
 958     main_loop_edge (nullptr),
 959     skip_main_loop_edge (nullptr),
 960     skip_this_loop_edge (nullptr),
 961     reusable_accumulators (),
 962     suggested_unroll_factor (1),
 963     max_vectorization_factor (0),
 964     mask_skip_niters (NULL_TREE),
 965     rgroup_compare_type (NULL_TREE),
 966     simd_if_cond (NULL_TREE),
 967     partial_vector_style (vect_partial_vectors_none),
 968     unaligned_dr (NULL),
 969     peeling_for_alignment (0),
 970     ptr_mask (0),
 971     ivexpr_map (NULL),
 972     scan_map (NULL),
 973     slp_unrolling_factor (1),
 974     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 975     vectorizable (false),
 976     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 977     using_partial_vectors_p (false),
 978     using_decrementing_iv_p (false),
 979     using_select_vl_p (false),
 980     epil_using_partial_vectors_p (false),
 981     partial_load_store_bias (0),
 982     peeling_for_gaps (false),
 983     peeling_for_niter (false),
 984     no_data_dependencies (false),
 985     has_mask_store (false),
 986     scalar_loop_scaling (profile_probability::uninitialized ()),
 987     scalar_loop (NULL),
 988     orig_loop_info (NULL)
 989 {
 990   /* CHECKME: We want to visit all BBs before their successors (except for
 991      latch blocks, for which this assertion wouldn't hold).  In the simple
 992      case of the loop forms we allow, a dfs order of the BBs would the same
 993      as reversed postorder traversal, so we are safe.  */
 994
 995   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 996                                           bbs, loop->num_nodes, loop);
 997   gcc_assert (nbbs == loop->num_nodes);
 998
 999   for (unsigned int i = 0; i < nbbs; i++)
1000     {
1001       basic_block bb = bbs[i];
1002       gimple_stmt_iterator si;
1003
1004       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1005         {
1006           gimple *phi = gsi_stmt (si);
1007           gimple_set_uid (phi, 0);
1008           add_stmt (phi);
1009         }
1010
1011       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1012         {
1013           gimple *stmt = gsi_stmt (si);
1014           gimple_set_uid (stmt, 0);
1015           if (is_gimple_debug (stmt))
1016             continue;
1017           add_stmt (stmt);
1018           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1019              third argument is the #pragma omp simd if (x) condition, when 0,
1020              loop shouldn't be vectorized, when non-zero constant, it should
1021              be vectorized normally, otherwise versioned with vectorized loop
1022              done if the condition is non-zero at runtime.  */
1023           if (loop_in->simduid
1024               && is_gimple_call (stmt)
1025               && gimple_call_internal_p (stmt)
1026               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027               && gimple_call_num_args (stmt) >= 3
1028               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029               && (loop_in->simduid
1030                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1031             {
1032               tree arg = gimple_call_arg (stmt, 2);
1033               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034                 simd_if_cond = arg;
1035               else
1036                 gcc_assert (integer_nonzerop (arg));
1037             }
1038         }
1039     }
1040
1041   epilogue_vinfos.create (6);
1042 }
1043
1044 /* Free all levels of rgroup CONTROLS.  */
1045
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1048 {
1049   rgroup_controls *rgc;
1050   unsigned int i;
1051   FOR_EACH_VEC_ELT (*controls, i, rgc)
1052     rgc->controls.release ();
1053   controls->release ();
1054 }
1055
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057    stmt_vec_info structs of all the stmts in the loop.  */
1058
1059 _loop_vec_info::~_loop_vec_info ()
1060 {
1061   free (bbs);
1062
1063   release_vec_loop_controls (&masks.rgc_vec);
1064   release_vec_loop_controls (&lens);
1065   delete ivexpr_map;
1066   delete scan_map;
1067   epilogue_vinfos.release ();
1068   delete scalar_costs;
1069   delete vector_costs;
1070
1071   /* When we release an epiloge vinfo that we do not intend to use
1072      avoid clearing AUX of the main loop which should continue to
1073      point to the main loop vinfo since otherwise we'll leak that.  */
1074   if (loop->aux == this)
1075     loop->aux = NULL;
1076 }
1077
1078 /* Return an invariant or register for EXPR and emit necessary
1079    computations in the LOOP_VINFO loop preheader.  */
1080
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1083 {
1084   if (is_gimple_reg (expr)
1085       || is_gimple_min_invariant (expr))
1086     return expr;
1087
1088   if (! loop_vinfo->ivexpr_map)
1089     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091   if (! cached)
1092     {
1093       gimple_seq stmts = NULL;
1094       cached = force_gimple_operand (unshare_expr (expr),
1095                                      &stmts, true, NULL_TREE);
1096       if (stmts)
1097         {
1098           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099           gsi_insert_seq_on_edge_immediate (e, stmts);
1100         }
1101     }
1102   return cached;
1103 }
1104
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106    all masks required to mask LOOP_VINFO.  */
1107
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1110 {
1111   rgroup_controls *rgm;
1112   unsigned int i;
1113   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114     if (rgm->type != NULL_TREE
1115         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116                                             cmp_type, rgm->type,
1117                                             OPTIMIZE_FOR_SPEED))
1118       return false;
1119   return true;
1120 }
1121
1122 /* Calculate the maximum number of scalars per iteration for every
1123    rgroup in LOOP_VINFO.  */
1124
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1127 {
1128   unsigned int res = 1;
1129   unsigned int i;
1130   rgroup_controls *rgm;
1131   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132     res = MAX (res, rgm->max_nscalars_per_iter);
1133   return res;
1134 }
1135
1136 /* Calculate the minimum precision necessary to represent:
1137
1138       MAX_NITERS * FACTOR
1139
1140    as an unsigned integer, where MAX_NITERS is the maximum number of
1141    loop header iterations for the original scalar form of LOOP_VINFO.  */
1142
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1145 {
1146   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147
1148   /* Get the maximum number of iterations that is representable
1149      in the counter type.  */
1150   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1152
1153   /* Get a more refined estimate for the number of iterations.  */
1154   widest_int max_back_edges;
1155   if (max_loop_iterations (loop, &max_back_edges))
1156     max_ni = wi::smin (max_ni, max_back_edges + 1);
1157
1158   /* Work out how many bits we need to represent the limit.  */
1159   return wi::min_precision (max_ni * factor, UNSIGNED);
1160 }
1161
1162 /* True if the loop needs peeling or partial vectors when vectorized.  */
1163
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1166 {
1167   unsigned HOST_WIDE_INT const_vf;
1168   HOST_WIDE_INT max_niter
1169     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1170
1171   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174                                           (loop_vinfo));
1175
1176   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1178     {
1179       /* Work out the (constant) number of iterations that need to be
1180          peeled for reasons other than niters.  */
1181       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183         peel_niter += 1;
1184       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186         return true;
1187     }
1188   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189       /* ??? When peeling for gaps but not alignment, we could
1190          try to check whether the (variable) niters is known to be
1191          VF * N + 1.  That's something of a niche case though.  */
1192       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195            < (unsigned) exact_log2 (const_vf))
1196           /* In case of versioning, check if the maximum number of
1197              iterations is greater than th.  If they are identical,
1198              the epilogue is unnecessary.  */
1199           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200               || ((unsigned HOST_WIDE_INT) max_niter
1201                   > (th / const_vf) * const_vf))))
1202     return true;
1203
1204   return false;
1205 }
1206
1207 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1208    whether we can actually generate the masks required.  Return true if so,
1209    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1210
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1213 {
1214   unsigned int min_ni_width;
1215
1216   /* Use a normal loop if there are no statements that need masking.
1217      This only happens in rare degenerate cases: it means that the loop
1218      has no loads, no stores, and no live-out values.  */
1219   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220     return false;
1221
1222   /* Produce the rgroup controls.  */
1223   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1224     {
1225       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226       tree vectype = mask.first;
1227       unsigned nvectors = mask.second;
1228
1229       if (masks->rgc_vec.length () < nvectors)
1230         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232       /* The number of scalars per iteration and the number of vectors are
1233          both compile-time constants.  */
1234       unsigned int nscalars_per_iter
1235           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1237
1238       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1239         {
1240           rgm->max_nscalars_per_iter = nscalars_per_iter;
1241           rgm->type = truth_type_for (vectype);
1242           rgm->factor = 1;
1243         }
1244     }
1245
1246   unsigned int max_nscalars_per_iter
1247     = vect_get_max_nscalars_per_iter (loop_vinfo);
1248
1249   /* Work out how many bits we need to represent the limit.  */
1250   min_ni_width
1251     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1252
1253   /* Find a scalar mode for which WHILE_ULT is supported.  */
1254   opt_scalar_int_mode cmp_mode_iter;
1255   tree cmp_type = NULL_TREE;
1256   tree iv_type = NULL_TREE;
1257   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258   unsigned int iv_precision = UINT_MAX;
1259
1260   if (iv_limit != -1)
1261     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262                                       UNSIGNED);
1263
1264   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1265     {
1266       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267       if (cmp_bits >= min_ni_width
1268           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1269         {
1270           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271           if (this_type
1272               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1273             {
1274               /* Although we could stop as soon as we find a valid mode,
1275                  there are at least two reasons why that's not always the
1276                  best choice:
1277
1278                  - An IV that's Pmode or wider is more likely to be reusable
1279                    in address calculations than an IV that's narrower than
1280                    Pmode.
1281
1282                  - Doing the comparison in IV_PRECISION or wider allows
1283                    a natural 0-based IV, whereas using a narrower comparison
1284                    type requires mitigations against wrap-around.
1285
1286                  Conversely, if the IV limit is variable, doing the comparison
1287                  in a wider type than the original type can introduce
1288                  unnecessary extensions, so picking the widest valid mode
1289                  is not always a good choice either.
1290
1291                  Here we prefer the first IV type that's Pmode or wider,
1292                  and the first comparison type that's IV_PRECISION or wider.
1293                  (The comparison type must be no wider than the IV type,
1294                  to avoid extensions in the vector loop.)
1295
1296                  ??? We might want to try continuing beyond Pmode for ILP32
1297                  targets if CMP_BITS < IV_PRECISION.  */
1298               iv_type = this_type;
1299               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300                 cmp_type = this_type;
1301               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302                 break;
1303             }
1304         }
1305     }
1306
1307   if (!cmp_type)
1308     {
1309       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310       return false;
1311     }
1312
1313   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316   return true;
1317 }
1318
1319 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1320    whether we can actually generate AVX512 style masks.  Return true if so,
1321    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1322
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1325 {
1326   /* Produce differently organized rgc_vec and differently check
1327      we can produce masks.  */
1328
1329   /* Use a normal loop if there are no statements that need masking.
1330      This only happens in rare degenerate cases: it means that the loop
1331      has no loads, no stores, and no live-out values.  */
1332   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333     return false;
1334
1335   /* For the decrementing IV we need to represent all values in
1336      [0, niter + niter_skip] where niter_skip is the elements we
1337      skip in the first iteration for prologue peeling.  */
1338   tree iv_type = NULL_TREE;
1339   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340   unsigned int iv_precision = UINT_MAX;
1341   if (iv_limit != -1)
1342     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1343
1344   /* First compute the type for the IV we use to track the remaining
1345      scalar iterations.  */
1346   opt_scalar_int_mode cmp_mode_iter;
1347   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1348     {
1349       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350       if (cmp_bits >= iv_precision
1351           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1352         {
1353           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354           if (iv_type)
1355             break;
1356         }
1357     }
1358   if (!iv_type)
1359     return false;
1360
1361   /* Produce the rgroup controls.  */
1362   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1363     {
1364       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365       tree vectype = mask.first;
1366       unsigned nvectors = mask.second;
1367
1368       /* The number of scalars per iteration and the number of vectors are
1369          both compile-time constants.  */
1370       unsigned int nscalars_per_iter
1371         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1373
1374       /* We index the rgroup_controls vector with nscalars_per_iter
1375          which we keep constant and instead have a varying nvectors,
1376          remembering the vector mask with the fewest nV.  */
1377       if (masks->rgc_vec.length () < nscalars_per_iter)
1378         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1380
1381       if (!rgm->type || rgm->factor > nvectors)
1382         {
1383           rgm->type = truth_type_for (vectype);
1384           rgm->compare_type = NULL_TREE;
1385           rgm->max_nscalars_per_iter = nscalars_per_iter;
1386           rgm->factor = nvectors;
1387           rgm->bias_adjusted_ctrl = NULL_TREE;
1388         }
1389     }
1390
1391   /* There is no fixed compare type we are going to use but we have to
1392      be able to get at one for each mask group.  */
1393   unsigned int min_ni_width
1394     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1395
1396   bool ok = true;
1397   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1398     {
1399       tree mask_type = rgc.type;
1400       if (!mask_type)
1401         continue;
1402
1403       if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1404         {
1405           ok = false;
1406           break;
1407         }
1408
1409       /* If iv_type is usable as compare type use that - we can elide the
1410          saturation in that case.   */
1411       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1412         {
1413           tree cmp_vectype
1414             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416             rgc.compare_type = cmp_vectype;
1417         }
1418       if (!rgc.compare_type)
1419         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1420           {
1421             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422             if (cmp_bits >= min_ni_width
1423                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1424               {
1425                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426                 if (!cmp_type)
1427                   continue;
1428
1429                 /* Check whether we can produce the mask with cmp_type.  */
1430                 tree cmp_vectype
1431                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1433                   {
1434                     rgc.compare_type = cmp_vectype;
1435                     break;
1436                   }
1437               }
1438         }
1439       if (!rgc.compare_type)
1440         {
1441           ok = false;
1442           break;
1443         }
1444     }
1445   if (!ok)
1446     {
1447       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448       return false;
1449     }
1450
1451   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454   return true;
1455 }
1456
1457 /* Check whether we can use vector access with length based on precison
1458    comparison.  So far, to keep it simple, we only allow the case that the
1459    precision of the target supported length is larger than the precision
1460    required by loop niters.  */
1461
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1464 {
1465   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466     return false;
1467
1468   machine_mode len_load_mode = get_len_load_store_mode
1469     (loop_vinfo->vector_mode, true).require ();
1470   machine_mode len_store_mode = get_len_load_store_mode
1471     (loop_vinfo->vector_mode, false).require ();
1472
1473   signed char partial_load_bias = internal_len_load_store_bias
1474     (IFN_LEN_LOAD, len_load_mode);
1475
1476   signed char partial_store_bias = internal_len_load_store_bias
1477     (IFN_LEN_STORE, len_store_mode);
1478
1479   gcc_assert (partial_load_bias == partial_store_bias);
1480
1481   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482     return false;
1483
1484   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485      len_loads with a length of zero.  In order to avoid that we prohibit
1486      more than one loop length here.  */
1487   if (partial_load_bias == -1
1488       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489     return false;
1490
1491   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1492
1493   unsigned int max_nitems_per_iter = 1;
1494   unsigned int i;
1495   rgroup_controls *rgl;
1496   /* Find the maximum number of items per iteration for every rgroup.  */
1497   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1498     {
1499       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1501     }
1502
1503   /* Work out how many bits we need to represent the length limit.  */
1504   unsigned int min_ni_prec
1505     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1506
1507   /* Now use the maximum of below precisions for one suitable IV type:
1508      - the IV's natural precision
1509      - the precision needed to hold: the maximum number of scalar
1510        iterations multiplied by the scale factor (min_ni_prec above)
1511      - the Pmode precision
1512
1513      If min_ni_prec is less than the precision of the current niters,
1514      we perfer to still use the niters type.  Prefer to use Pmode and
1515      wider IV to avoid narrow conversions.  */
1516
1517   unsigned int ni_prec
1518     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519   min_ni_prec = MAX (min_ni_prec, ni_prec);
1520   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1521
1522   tree iv_type = NULL_TREE;
1523   opt_scalar_int_mode tmode_iter;
1524   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1525     {
1526       scalar_mode tmode = tmode_iter.require ();
1527       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1528
1529       /* ??? Do we really want to construct one IV whose precision exceeds
1530          BITS_PER_WORD?  */
1531       if (tbits > BITS_PER_WORD)
1532         break;
1533
1534       /* Find the first available standard integral type.  */
1535       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1536         {
1537           iv_type = build_nonstandard_integer_type (tbits, true);
1538           break;
1539         }
1540     }
1541
1542   if (!iv_type)
1543     {
1544       if (dump_enabled_p ())
1545         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546                          "can't vectorize with length-based partial vectors"
1547                          " because there is no suitable iv type.\n");
1548       return false;
1549     }
1550
1551   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1554
1555   return true;
1556 }
1557
1558 /* Calculate the cost of one scalar iteration of the loop.  */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1561 {
1562   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564   int nbbs = loop->num_nodes, factor;
1565   int innerloop_iters, i;
1566
1567   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1568
1569   /* Gather costs for statements in the scalar loop.  */
1570
1571   /* FORNOW.  */
1572   innerloop_iters = 1;
1573   if (loop->inner)
1574     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1575
1576   for (i = 0; i < nbbs; i++)
1577     {
1578       gimple_stmt_iterator si;
1579       basic_block bb = bbs[i];
1580
1581       if (bb->loop_father == loop->inner)
1582         factor = innerloop_iters;
1583       else
1584         factor = 1;
1585
1586       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1587         {
1588           gimple *stmt = gsi_stmt (si);
1589           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1590
1591           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592             continue;
1593
1594           /* Skip stmts that are not vectorized inside the loop.  */
1595           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597               && (!STMT_VINFO_LIVE_P (vstmt_info)
1598                   || !VECTORIZABLE_CYCLE_DEF
1599                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600             continue;
1601
1602           vect_cost_for_stmt kind;
1603           if (STMT_VINFO_DATA_REF (stmt_info))
1604             {
1605               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606                kind = scalar_load;
1607              else
1608                kind = scalar_store;
1609             }
1610           else if (vect_nop_conversion_p (stmt_info))
1611             continue;
1612           else
1613             kind = scalar_stmt;
1614
1615           /* We are using vect_prologue here to avoid scaling twice
1616              by the inner loop factor.  */
1617           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618                             factor, kind, stmt_info, 0, vect_prologue);
1619         }
1620     }
1621
1622   /* Now accumulate cost.  */
1623   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624   add_stmt_costs (loop_vinfo->scalar_costs,
1625                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626   loop_vinfo->scalar_costs->finish_cost (nullptr);
1627 }
1628
1629
1630 /* Function vect_analyze_loop_form.
1631
1632    Verify that certain CFG restrictions hold, including:
1633    - the loop has a pre-header
1634    - the loop has a single entry and exit
1635    - the loop exit condition is simple enough
1636    - the number of iterations can be analyzed, i.e, a countable loop.  The
1637      niter could be analyzed under some assumptions.  */
1638
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1641 {
1642   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1643
1644   /* Different restrictions apply when we are considering an inner-most loop,
1645      vs. an outer (nested) loop.
1646      (FORNOW. May want to relax some of these restrictions in the future).  */
1647
1648   info->inner_loop_cond = NULL;
1649   if (!loop->inner)
1650     {
1651       /* Inner-most loop.  We currently require that the number of BBs is
1652          exactly 2 (the header and latch).  Vectorizable inner-most loops
1653          look like this:
1654
1655                         (pre-header)
1656                            |
1657                           header <--------+
1658                            | |            |
1659                            | +--> latch --+
1660                            |
1661                         (exit-bb)  */
1662
1663       if (loop->num_nodes != 2)
1664         return opt_result::failure_at (vect_location,
1665                                        "not vectorized:"
1666                                        " control flow in loop.\n");
1667
1668       if (empty_block_p (loop->header))
1669         return opt_result::failure_at (vect_location,
1670                                        "not vectorized: empty loop.\n");
1671     }
1672   else
1673     {
1674       class loop *innerloop = loop->inner;
1675       edge entryedge;
1676
1677       /* Nested loop. We currently require that the loop is doubly-nested,
1678          contains a single inner loop, and the number of BBs is exactly 5.
1679          Vectorizable outer-loops look like this:
1680
1681                         (pre-header)
1682                            |
1683                           header <---+
1684                            |         |
1685                           inner-loop |
1686                            |         |
1687                           tail ------+
1688                            |
1689                         (exit-bb)
1690
1691          The inner-loop has the properties expected of inner-most loops
1692          as described above.  */
1693
1694       if ((loop->inner)->inner || (loop->inner)->next)
1695         return opt_result::failure_at (vect_location,
1696                                        "not vectorized:"
1697                                        " multiple nested loops.\n");
1698
1699       if (loop->num_nodes != 5)
1700         return opt_result::failure_at (vect_location,
1701                                        "not vectorized:"
1702                                        " control flow in loop.\n");
1703
1704       entryedge = loop_preheader_edge (innerloop);
1705       if (entryedge->src != loop->header
1706           || !single_exit (innerloop)
1707           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708         return opt_result::failure_at (vect_location,
1709                                        "not vectorized:"
1710                                        " unsupported outerloop form.\n");
1711
1712       /* Analyze the inner-loop.  */
1713       vect_loop_form_info inner;
1714       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715       if (!res)
1716         {
1717           if (dump_enabled_p ())
1718             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719                              "not vectorized: Bad inner loop.\n");
1720           return res;
1721         }
1722
1723       /* Don't support analyzing niter under assumptions for inner
1724          loop.  */
1725       if (!integer_onep (inner.assumptions))
1726         return opt_result::failure_at (vect_location,
1727                                        "not vectorized: Bad inner loop.\n");
1728
1729       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730         return opt_result::failure_at (vect_location,
1731                                        "not vectorized: inner-loop count not"
1732                                        " invariant.\n");
1733
1734       if (dump_enabled_p ())
1735         dump_printf_loc (MSG_NOTE, vect_location,
1736                          "Considering outer-loop vectorization.\n");
1737       info->inner_loop_cond = inner.loop_cond;
1738     }
1739
1740   if (!single_exit (loop))
1741     return opt_result::failure_at (vect_location,
1742                                    "not vectorized: multiple exits.\n");
1743   if (EDGE_COUNT (loop->header->preds) != 2)
1744     return opt_result::failure_at (vect_location,
1745                                    "not vectorized:"
1746                                    " too many incoming edges.\n");
1747
1748   /* We assume that the loop exit condition is at the end of the loop. i.e,
1749      that the loop is represented as a do-while (with a proper if-guard
1750      before the loop if needed), where the loop header contains all the
1751      executable statements, and the latch is empty.  */
1752   if (!empty_block_p (loop->latch)
1753       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754     return opt_result::failure_at (vect_location,
1755                                    "not vectorized: latch block not empty.\n");
1756
1757   /* Make sure the exit is not abnormal.  */
1758   edge e = single_exit (loop);
1759   if (e->flags & EDGE_ABNORMAL)
1760     return opt_result::failure_at (vect_location,
1761                                    "not vectorized:"
1762                                    " abnormal loop exit edge.\n");
1763
1764   info->loop_cond
1765     = vect_get_loop_niters (loop, &info->assumptions,
1766                             &info->number_of_iterations,
1767                             &info->number_of_iterationsm1);
1768   if (!info->loop_cond)
1769     return opt_result::failure_at
1770       (vect_location,
1771        "not vectorized: complicated exit condition.\n");
1772
1773   if (integer_zerop (info->assumptions)
1774       || !info->number_of_iterations
1775       || chrec_contains_undetermined (info->number_of_iterations))
1776     return opt_result::failure_at
1777       (info->loop_cond,
1778        "not vectorized: number of iterations cannot be computed.\n");
1779
1780   if (integer_zerop (info->number_of_iterations))
1781     return opt_result::failure_at
1782       (info->loop_cond,
1783        "not vectorized: number of iterations = 0.\n");
1784
1785   if (!(tree_fits_shwi_p (info->number_of_iterations)
1786         && tree_to_shwi (info->number_of_iterations) > 0))
1787     {
1788       if (dump_enabled_p ())
1789         {
1790           dump_printf_loc (MSG_NOTE, vect_location,
1791                            "Symbolic number of iterations is ");
1792           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793           dump_printf (MSG_NOTE, "\n");
1794         }
1795     }
1796
1797   return opt_result::success ();
1798 }
1799
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801    vect_analyze_loop_form result.  */
1802
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805                         const vect_loop_form_info *info,
1806                         loop_vec_info main_loop_info)
1807 {
1808   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813   /* Also record the assumptions for versioning.  */
1814   if (!integer_onep (info->assumptions) && !main_loop_info)
1815     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1816
1817   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819   if (info->inner_loop_cond)
1820     {
1821       stmt_vec_info inner_loop_cond_info
1822         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824       /* If we have an estimate on the number of iterations of the inner
1825          loop use that to limit the scale for costing, otherwise use
1826          --param vect-inner-loop-cost-factor literally.  */
1827       widest_int nit;
1828       if (estimated_stmt_executions (loop->inner, &nit))
1829         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1831     }
1832
1833   return loop_vinfo;
1834 }
1835
1836
1837
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839    statements update the vectorization factor.  */
1840
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1843 {
1844   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846   int nbbs = loop->num_nodes;
1847   poly_uint64 vectorization_factor;
1848   int i;
1849
1850   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1851
1852   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853   gcc_assert (known_ne (vectorization_factor, 0U));
1854
1855   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1856      vectorization factor of the loop is the unrolling factor required by
1857      the SLP instances.  If that unrolling factor is 1, we say, that we
1858      perform pure SLP on loop - cross iteration parallelism is not
1859      exploited.  */
1860   bool only_slp_in_loop = true;
1861   for (i = 0; i < nbbs; i++)
1862     {
1863       basic_block bb = bbs[i];
1864       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865            gsi_next (&si))
1866         {
1867           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868           if (!stmt_info)
1869             continue;
1870           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872               && !PURE_SLP_STMT (stmt_info))
1873             /* STMT needs both SLP and loop-based vectorization.  */
1874             only_slp_in_loop = false;
1875         }
1876       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877            gsi_next (&si))
1878         {
1879           if (is_gimple_debug (gsi_stmt (si)))
1880             continue;
1881           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882           stmt_info = vect_stmt_to_vectorize (stmt_info);
1883           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885               && !PURE_SLP_STMT (stmt_info))
1886             /* STMT needs both SLP and loop-based vectorization.  */
1887             only_slp_in_loop = false;
1888         }
1889     }
1890
1891   if (only_slp_in_loop)
1892     {
1893       if (dump_enabled_p ())
1894         dump_printf_loc (MSG_NOTE, vect_location,
1895                          "Loop contains only SLP stmts\n");
1896       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1897     }
1898   else
1899     {
1900       if (dump_enabled_p ())
1901         dump_printf_loc (MSG_NOTE, vect_location,
1902                          "Loop contains SLP and non-SLP stmts\n");
1903       /* Both the vectorization factor and unroll factor have the form
1904          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905          so they must have a common multiple.  */
1906       vectorization_factor
1907         = force_common_multiple (vectorization_factor,
1908                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1909     }
1910
1911   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912   if (dump_enabled_p ())
1913     {
1914       dump_printf_loc (MSG_NOTE, vect_location,
1915                        "Updating vectorization factor to ");
1916       dump_dec (MSG_NOTE, vectorization_factor);
1917       dump_printf (MSG_NOTE, ".\n");
1918     }
1919 }
1920
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922    the other phi in the reduction is also relevant for vectorization.
1923    This rejects cases such as:
1924
1925       outer1:
1926         x_1 = PHI <x_3(outer2), ...>;
1927         ...
1928
1929       inner:
1930         x_2 = ...;
1931         ...
1932
1933       outer2:
1934         x_3 = PHI <x_2(inner)>;
1935
1936    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1937
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1940 {
1941   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942     return false;
1943
1944   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1945 }
1946
1947 /* Function vect_analyze_loop_operations.
1948
1949    Scan the loop stmts and make sure they are all vectorizable.  */
1950
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1953 {
1954   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956   int nbbs = loop->num_nodes;
1957   int i;
1958   stmt_vec_info stmt_info;
1959   bool need_to_vectorize = false;
1960   bool ok;
1961
1962   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1963
1964   auto_vec<stmt_info_for_cost> cost_vec;
1965
1966   for (i = 0; i < nbbs; i++)
1967     {
1968       basic_block bb = bbs[i];
1969
1970       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971            gsi_next (&si))
1972         {
1973           gphi *phi = si.phi ();
1974           ok = true;
1975
1976           stmt_info = loop_vinfo->lookup_stmt (phi);
1977           if (dump_enabled_p ())
1978             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979                              (gimple *) phi);
1980           if (virtual_operand_p (gimple_phi_result (phi)))
1981             continue;
1982
1983           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984              (i.e., a phi in the tail of the outer-loop).  */
1985           if (! is_loop_header_bb_p (bb))
1986             {
1987               /* FORNOW: we currently don't support the case that these phis
1988                  are not used in the outerloop (unless it is double reduction,
1989                  i.e., this phi is vect_reduction_def), cause this case
1990                  requires to actually do something here.  */
1991               if (STMT_VINFO_LIVE_P (stmt_info)
1992                   && !vect_active_double_reduction_p (stmt_info))
1993                 return opt_result::failure_at (phi,
1994                                                "Unsupported loop-closed phi"
1995                                                " in outer-loop.\n");
1996
1997               /* If PHI is used in the outer loop, we check that its operand
1998                  is defined in the inner loop.  */
1999               if (STMT_VINFO_RELEVANT_P (stmt_info))
2000                 {
2001                   tree phi_op;
2002
2003                   if (gimple_phi_num_args (phi) != 1)
2004                     return opt_result::failure_at (phi, "unsupported phi");
2005
2006                   phi_op = PHI_ARG_DEF (phi, 0);
2007                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008                   if (!op_def_info)
2009                     return opt_result::failure_at (phi, "unsupported phi\n");
2010
2011                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012                       && (STMT_VINFO_RELEVANT (op_def_info)
2013                           != vect_used_in_outer_by_reduction))
2014                     return opt_result::failure_at (phi, "unsupported phi\n");
2015
2016                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2018                            == vect_double_reduction_def))
2019                       && !vectorizable_lc_phi (loop_vinfo,
2020                                                stmt_info, NULL, NULL))
2021                     return opt_result::failure_at (phi, "unsupported phi\n");
2022                 }
2023
2024               continue;
2025             }
2026
2027           gcc_assert (stmt_info);
2028
2029           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030                || STMT_VINFO_LIVE_P (stmt_info))
2031               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033             /* A scalar-dependence cycle that we don't support.  */
2034             return opt_result::failure_at (phi,
2035                                            "not vectorized:"
2036                                            " scalar dependence cycle.\n");
2037
2038           if (STMT_VINFO_RELEVANT_P (stmt_info))
2039             {
2040               need_to_vectorize = true;
2041               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042                   && ! PURE_SLP_STMT (stmt_info))
2043                 ok = vectorizable_induction (loop_vinfo,
2044                                              stmt_info, NULL, NULL,
2045                                              &cost_vec);
2046               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2048                             == vect_double_reduction_def)
2049                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050                        && ! PURE_SLP_STMT (stmt_info))
2051                 ok = vectorizable_reduction (loop_vinfo,
2052                                              stmt_info, NULL, NULL, &cost_vec);
2053               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054                         == vect_first_order_recurrence)
2055                        && ! PURE_SLP_STMT (stmt_info))
2056                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057                                            &cost_vec);
2058             }
2059
2060           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2061           if (ok
2062               && STMT_VINFO_LIVE_P (stmt_info)
2063               && !PURE_SLP_STMT (stmt_info))
2064             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2065                                               -1, false, &cost_vec);
2066
2067           if (!ok)
2068             return opt_result::failure_at (phi,
2069                                            "not vectorized: relevant phi not "
2070                                            "supported: %G",
2071                                            static_cast <gimple *> (phi));
2072         }
2073
2074       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2075            gsi_next (&si))
2076         {
2077           gimple *stmt = gsi_stmt (si);
2078           if (!gimple_clobber_p (stmt)
2079               && !is_gimple_debug (stmt))
2080             {
2081               opt_result res
2082                 = vect_analyze_stmt (loop_vinfo,
2083                                      loop_vinfo->lookup_stmt (stmt),
2084                                      &need_to_vectorize,
2085                                      NULL, NULL, &cost_vec);
2086               if (!res)
2087                 return res;
2088             }
2089         }
2090     } /* bbs */
2091
2092   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2093
2094   /* All operations in the loop are either irrelevant (deal with loop
2095      control, or dead), or only used outside the loop and can be moved
2096      out of the loop (e.g. invariants, inductions).  The loop can be
2097      optimized away by scalar optimizations.  We're better off not
2098      touching this loop.  */
2099   if (!need_to_vectorize)
2100     {
2101       if (dump_enabled_p ())
2102         dump_printf_loc (MSG_NOTE, vect_location,
2103                          "All the computation can be taken out of the loop.\n");
2104       return opt_result::failure_at
2105         (vect_location,
2106          "not vectorized: redundant loop. no profit to vectorize.\n");
2107     }
2108
2109   return opt_result::success ();
2110 }
2111
2112 /* Return true if we know that the iteration count is smaller than the
2113    vectorization factor.  Return false if it isn't, or if we can't be sure
2114    either way.  */
2115
2116 static bool
2117 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2118 {
2119   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2120
2121   HOST_WIDE_INT max_niter;
2122   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2123     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2124   else
2125     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2126
2127   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2128     return true;
2129
2130   return false;
2131 }
2132
2133 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2134    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2135    definitely no, or -1 if it's worth retrying.  */
2136
2137 static int
2138 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2139                            unsigned *suggested_unroll_factor)
2140 {
2141   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2142   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2143
2144   /* Only loops that can handle partially-populated vectors can have iteration
2145      counts less than the vectorization factor.  */
2146   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2147       && vect_known_niters_smaller_than_vf (loop_vinfo))
2148     {
2149       if (dump_enabled_p ())
2150         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151                          "not vectorized: iteration count smaller than "
2152                          "vectorization factor.\n");
2153       return 0;
2154     }
2155
2156   /* If we know the number of iterations we can do better, for the
2157      epilogue we can also decide whether the main loop leaves us
2158      with enough iterations, prefering a smaller vector epilog then
2159      also possibly used for the case we skip the vector loop.  */
2160   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2161     {
2162       widest_int scalar_niters
2163         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2164       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2165         {
2166           loop_vec_info orig_loop_vinfo
2167             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2168           unsigned lowest_vf
2169             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2170           int prolog_peeling = 0;
2171           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2172             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2173           if (prolog_peeling >= 0
2174               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2175                            lowest_vf))
2176             {
2177               unsigned gap
2178                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2179               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2180                                % lowest_vf + gap);
2181             }
2182         }
2183       /* Reject vectorizing for a single scalar iteration, even if
2184          we could in principle implement that using partial vectors.  */
2185       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2186       if (scalar_niters <= peeling_gap + 1)
2187         {
2188           if (dump_enabled_p ())
2189             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2190                              "not vectorized: loop only has a single "
2191                              "scalar iteration.\n");
2192           return 0;
2193         }
2194
2195       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2196         {
2197           /* Check that the loop processes at least one full vector.  */
2198           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2199           if (known_lt (scalar_niters, vf))
2200             {
2201               if (dump_enabled_p ())
2202                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203                                  "loop does not have enough iterations "
2204                                  "to support vectorization.\n");
2205               return 0;
2206             }
2207
2208           /* If we need to peel an extra epilogue iteration to handle data
2209              accesses with gaps, check that there are enough scalar iterations
2210              available.
2211
2212              The check above is redundant with this one when peeling for gaps,
2213              but the distinction is useful for diagnostics.  */
2214           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2215               && known_le (scalar_niters, vf))
2216             {
2217               if (dump_enabled_p ())
2218                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219                                  "loop does not have enough iterations "
2220                                  "to support peeling for gaps.\n");
2221               return 0;
2222             }
2223         }
2224     }
2225
2226   /* If using the "very cheap" model. reject cases in which we'd keep
2227      a copy of the scalar code (even if we might be able to vectorize it).  */
2228   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2229       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2230           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2231           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2232     {
2233       if (dump_enabled_p ())
2234         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2235                          "some scalar iterations would need to be peeled\n");
2236       return 0;
2237     }
2238
2239   int min_profitable_iters, min_profitable_estimate;
2240   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2241                                       &min_profitable_estimate,
2242                                       suggested_unroll_factor);
2243
2244   if (min_profitable_iters < 0)
2245     {
2246       if (dump_enabled_p ())
2247         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248                          "not vectorized: vectorization not profitable.\n");
2249       if (dump_enabled_p ())
2250         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251                          "not vectorized: vector version will never be "
2252                          "profitable.\n");
2253       return -1;
2254     }
2255
2256   int min_scalar_loop_bound = (param_min_vect_loop_bound
2257                                * assumed_vf);
2258
2259   /* Use the cost model only if it is more conservative than user specified
2260      threshold.  */
2261   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2262                                     min_profitable_iters);
2263
2264   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2265
2266   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2267       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2268     {
2269       if (dump_enabled_p ())
2270         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271                          "not vectorized: vectorization not profitable.\n");
2272       if (dump_enabled_p ())
2273         dump_printf_loc (MSG_NOTE, vect_location,
2274                          "not vectorized: iteration count smaller than user "
2275                          "specified loop bound parameter or minimum profitable "
2276                          "iterations (whichever is more conservative).\n");
2277       return 0;
2278     }
2279
2280   /* The static profitablity threshold min_profitable_estimate includes
2281      the cost of having to check at runtime whether the scalar loop
2282      should be used instead.  If it turns out that we don't need or want
2283      such a check, the threshold we should use for the static estimate
2284      is simply the point at which the vector loop becomes more profitable
2285      than the scalar loop.  */
2286   if (min_profitable_estimate > min_profitable_iters
2287       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2288       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2289       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2290       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2291     {
2292       if (dump_enabled_p ())
2293         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2294                          " choice between the scalar and vector loops\n");
2295       min_profitable_estimate = min_profitable_iters;
2296     }
2297
2298   /* If the vector loop needs multiple iterations to be beneficial then
2299      things are probably too close to call, and the conservative thing
2300      would be to stick with the scalar code.  */
2301   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2302       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2303     {
2304       if (dump_enabled_p ())
2305         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306                          "one iteration of the vector loop would be"
2307                          " more expensive than the equivalent number of"
2308                          " iterations of the scalar loop\n");
2309       return 0;
2310     }
2311
2312   HOST_WIDE_INT estimated_niter;
2313
2314   /* If we are vectorizing an epilogue then we know the maximum number of
2315      scalar iterations it will cover is at least one lower than the
2316      vectorization factor of the main loop.  */
2317   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2318     estimated_niter
2319       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2320   else
2321     {
2322       estimated_niter = estimated_stmt_executions_int (loop);
2323       if (estimated_niter == -1)
2324         estimated_niter = likely_max_stmt_executions_int (loop);
2325     }
2326   if (estimated_niter != -1
2327       && ((unsigned HOST_WIDE_INT) estimated_niter
2328           < MAX (th, (unsigned) min_profitable_estimate)))
2329     {
2330       if (dump_enabled_p ())
2331         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332                          "not vectorized: estimated iteration count too "
2333                          "small.\n");
2334       if (dump_enabled_p ())
2335         dump_printf_loc (MSG_NOTE, vect_location,
2336                          "not vectorized: estimated iteration count smaller "
2337                          "than specified loop bound parameter or minimum "
2338                          "profitable iterations (whichever is more "
2339                          "conservative).\n");
2340       return -1;
2341     }
2342
2343   return 1;
2344 }
2345
2346 static opt_result
2347 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2348                            vec<data_reference_p> *datarefs,
2349                            unsigned int *n_stmts)
2350 {
2351   *n_stmts = 0;
2352   for (unsigned i = 0; i < loop->num_nodes; i++)
2353     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2354          !gsi_end_p (gsi); gsi_next (&gsi))
2355       {
2356         gimple *stmt = gsi_stmt (gsi);
2357         if (is_gimple_debug (stmt))
2358           continue;
2359         ++(*n_stmts);
2360         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2361                                                         NULL, 0);
2362         if (!res)
2363           {
2364             if (is_gimple_call (stmt) && loop->safelen)
2365               {
2366                 tree fndecl = gimple_call_fndecl (stmt), op;
2367                 if (fndecl == NULL_TREE
2368                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2369                   {
2370                     fndecl = gimple_call_arg (stmt, 0);
2371                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2372                     fndecl = TREE_OPERAND (fndecl, 0);
2373                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2374                   }
2375                 if (fndecl != NULL_TREE)
2376                   {
2377                     cgraph_node *node = cgraph_node::get (fndecl);
2378                     if (node != NULL && node->simd_clones != NULL)
2379                       {
2380                         unsigned int j, n = gimple_call_num_args (stmt);
2381                         for (j = 0; j < n; j++)
2382                           {
2383                             op = gimple_call_arg (stmt, j);
2384                             if (DECL_P (op)
2385                                 || (REFERENCE_CLASS_P (op)
2386                                     && get_base_address (op)))
2387                               break;
2388                           }
2389                         op = gimple_call_lhs (stmt);
2390                         /* Ignore #pragma omp declare simd functions
2391                            if they don't have data references in the
2392                            call stmt itself.  */
2393                         if (j == n
2394                             && !(op
2395                                  && (DECL_P (op)
2396                                      || (REFERENCE_CLASS_P (op)
2397                                          && get_base_address (op)))))
2398                           continue;
2399                       }
2400                   }
2401               }
2402             return res;
2403           }
2404         /* If dependence analysis will give up due to the limit on the
2405            number of datarefs stop here and fail fatally.  */
2406         if (datarefs->length ()
2407             > (unsigned)param_loop_max_datarefs_for_datadeps)
2408           return opt_result::failure_at (stmt, "exceeded param "
2409                                          "loop-max-datarefs-for-datadeps\n");
2410       }
2411   return opt_result::success ();
2412 }
2413
2414 /* Look for SLP-only access groups and turn each individual access into its own
2415    group.  */
2416 static void
2417 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2418 {
2419   unsigned int i;
2420   struct data_reference *dr;
2421
2422   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2423
2424   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2425   FOR_EACH_VEC_ELT (datarefs, i, dr)
2426     {
2427       gcc_assert (DR_REF (dr));
2428       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2429
2430       /* Check if the load is a part of an interleaving chain.  */
2431       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2432         {
2433           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2434           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2435           unsigned int group_size = DR_GROUP_SIZE (first_element);
2436
2437           /* Check if SLP-only groups.  */
2438           if (!STMT_SLP_TYPE (stmt_info)
2439               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2440             {
2441               /* Dissolve the group.  */
2442               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2443
2444               stmt_vec_info vinfo = first_element;
2445               while (vinfo)
2446                 {
2447                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2448                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2449                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2450                   DR_GROUP_SIZE (vinfo) = 1;
2451                   if (STMT_VINFO_STRIDED_P (first_element))
2452                     DR_GROUP_GAP (vinfo) = 0;
2453                   else
2454                     DR_GROUP_GAP (vinfo) = group_size - 1;
2455                   /* Duplicate and adjust alignment info, it needs to
2456                      be present on each group leader, see dr_misalignment.  */
2457                   if (vinfo != first_element)
2458                     {
2459                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2460                       dr_info2->target_alignment = dr_info->target_alignment;
2461                       int misalignment = dr_info->misalignment;
2462                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2463                         {
2464                           HOST_WIDE_INT diff
2465                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2466                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2467                           unsigned HOST_WIDE_INT align_c
2468                             = dr_info->target_alignment.to_constant ();
2469                           misalignment = (misalignment + diff) % align_c;
2470                         }
2471                       dr_info2->misalignment = misalignment;
2472                     }
2473                   vinfo = next;
2474                 }
2475             }
2476         }
2477     }
2478 }
2479
2480 /* Determine if operating on full vectors for LOOP_VINFO might leave
2481    some scalar iterations still to do.  If so, decide how we should
2482    handle those scalar iterations.  The possibilities are:
2483
2484    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2485        In this case:
2486
2487          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2488          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2489          LOOP_VINFO_PEELING_FOR_NITER == false
2490
2491    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2492        to handle the remaining scalar iterations.  In this case:
2493
2494          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2495          LOOP_VINFO_PEELING_FOR_NITER == true
2496
2497        There are two choices:
2498
2499        (2a) Consider vectorizing the epilogue loop at the same VF as the
2500             main loop, but using partial vectors instead of full vectors.
2501             In this case:
2502
2503               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2504
2505        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2506             In this case:
2507
2508               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2509  */
2510
2511 opt_result
2512 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2513 {
2514   /* Determine whether there would be any scalar iterations left over.  */
2515   bool need_peeling_or_partial_vectors_p
2516     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2517
2518   /* Decide whether to vectorize the loop with partial vectors.  */
2519   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2520   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2521   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2522       && need_peeling_or_partial_vectors_p)
2523     {
2524       /* For partial-vector-usage=1, try to push the handling of partial
2525          vectors to the epilogue, with the main loop continuing to operate
2526          on full vectors.
2527
2528          If we are unrolling we also do not want to use partial vectors. This
2529          is to avoid the overhead of generating multiple masks and also to
2530          avoid having to execute entire iterations of FALSE masked instructions
2531          when dealing with one or less full iterations.
2532
2533          ??? We could then end up failing to use partial vectors if we
2534          decide to peel iterations into a prologue, and if the main loop
2535          then ends up processing fewer than VF iterations.  */
2536       if ((param_vect_partial_vector_usage == 1
2537            || loop_vinfo->suggested_unroll_factor > 1)
2538           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2539           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2540         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2541       else
2542         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2543     }
2544
2545   if (dump_enabled_p ())
2546     dump_printf_loc (MSG_NOTE, vect_location,
2547                      "operating on %s vectors%s.\n",
2548                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2549                      ? "partial" : "full",
2550                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2551                      ? " for epilogue loop" : "");
2552
2553   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2554     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2555        && need_peeling_or_partial_vectors_p);
2556
2557   return opt_result::success ();
2558 }
2559
2560 /* Function vect_analyze_loop_2.
2561
2562    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2563    analyses will record information in some members of LOOP_VINFO.  FATAL
2564    indicates if some analysis meets fatal error.  If one non-NULL pointer
2565    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2566    worked out suggested unroll factor, while one NULL pointer shows it's
2567    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2568    is to hold the slp decision when the suggested unroll factor is worked
2569    out.  */
2570 static opt_result
2571 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2572                      unsigned *suggested_unroll_factor,
2573                      bool& slp_done_for_suggested_uf)
2574 {
2575   opt_result ok = opt_result::success ();
2576   int res;
2577   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2578   poly_uint64 min_vf = 2;
2579   loop_vec_info orig_loop_vinfo = NULL;
2580
2581   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2582      loop_vec_info of the first vectorized loop.  */
2583   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2584     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2585   else
2586     orig_loop_vinfo = loop_vinfo;
2587   gcc_assert (orig_loop_vinfo);
2588
2589   /* The first group of checks is independent of the vector size.  */
2590   fatal = true;
2591
2592   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2593       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2594     return opt_result::failure_at (vect_location,
2595                                    "not vectorized: simd if(0)\n");
2596
2597   /* Find all data references in the loop (which correspond to vdefs/vuses)
2598      and analyze their evolution in the loop.  */
2599
2600   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2601
2602   /* Gather the data references and count stmts in the loop.  */
2603   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2604     {
2605       opt_result res
2606         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2607                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2608                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2609       if (!res)
2610         {
2611           if (dump_enabled_p ())
2612             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2613                              "not vectorized: loop contains function "
2614                              "calls or data references that cannot "
2615                              "be analyzed\n");
2616           return res;
2617         }
2618       loop_vinfo->shared->save_datarefs ();
2619     }
2620   else
2621     loop_vinfo->shared->check_datarefs ();
2622
2623   /* Analyze the data references and also adjust the minimal
2624      vectorization factor according to the loads and stores.  */
2625
2626   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2627   if (!ok)
2628     {
2629       if (dump_enabled_p ())
2630         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631                          "bad data references.\n");
2632       return ok;
2633     }
2634
2635   /* Check if we are applying unroll factor now.  */
2636   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2637   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2638
2639   /* If the slp decision is false when suggested unroll factor is worked
2640      out, and we are applying suggested unroll factor, we can simply skip
2641      all slp related analyses this time.  */
2642   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2643
2644   /* Classify all cross-iteration scalar data-flow cycles.
2645      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2646   vect_analyze_scalar_cycles (loop_vinfo, slp);
2647
2648   vect_pattern_recog (loop_vinfo);
2649
2650   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2651
2652   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2653      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2654
2655   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2656   if (!ok)
2657     {
2658       if (dump_enabled_p ())
2659         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2660                          "bad data access.\n");
2661       return ok;
2662     }
2663
2664   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2665
2666   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2667   if (!ok)
2668     {
2669       if (dump_enabled_p ())
2670         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2671                          "unexpected pattern.\n");
2672       return ok;
2673     }
2674
2675   /* While the rest of the analysis below depends on it in some way.  */
2676   fatal = false;
2677
2678   /* Analyze data dependences between the data-refs in the loop
2679      and adjust the maximum vectorization factor according to
2680      the dependences.
2681      FORNOW: fail at the first data dependence that we encounter.  */
2682
2683   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2684   if (!ok)
2685     {
2686       if (dump_enabled_p ())
2687         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2688                          "bad data dependence.\n");
2689       return ok;
2690     }
2691   if (max_vf != MAX_VECTORIZATION_FACTOR
2692       && maybe_lt (max_vf, min_vf))
2693     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2694   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2695
2696   ok = vect_determine_vectorization_factor (loop_vinfo);
2697   if (!ok)
2698     {
2699       if (dump_enabled_p ())
2700         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2701                          "can't determine vectorization factor.\n");
2702       return ok;
2703     }
2704   if (max_vf != MAX_VECTORIZATION_FACTOR
2705       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2706     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2707
2708   /* Compute the scalar iteration cost.  */
2709   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2710
2711   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2712
2713   if (slp)
2714     {
2715       /* Check the SLP opportunities in the loop, analyze and build
2716          SLP trees.  */
2717       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2718       if (!ok)
2719         return ok;
2720
2721       /* If there are any SLP instances mark them as pure_slp.  */
2722       slp = vect_make_slp_decision (loop_vinfo);
2723       if (slp)
2724         {
2725           /* Find stmts that need to be both vectorized and SLPed.  */
2726           vect_detect_hybrid_slp (loop_vinfo);
2727
2728           /* Update the vectorization factor based on the SLP decision.  */
2729           vect_update_vf_for_slp (loop_vinfo);
2730
2731           /* Optimize the SLP graph with the vectorization factor fixed.  */
2732           vect_optimize_slp (loop_vinfo);
2733
2734           /* Gather the loads reachable from the SLP graph entries.  */
2735           vect_gather_slp_loads (loop_vinfo);
2736         }
2737     }
2738
2739   bool saved_can_use_partial_vectors_p
2740     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2741
2742   /* We don't expect to have to roll back to anything other than an empty
2743      set of rgroups.  */
2744   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2745
2746   /* This is the point where we can re-start analysis with SLP forced off.  */
2747 start_over:
2748
2749   /* Apply the suggested unrolling factor, this was determined by the backend
2750      during finish_cost the first time we ran the analyzis for this
2751      vector mode.  */
2752   if (applying_suggested_uf)
2753     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2754
2755   /* Now the vectorization factor is final.  */
2756   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2757   gcc_assert (known_ne (vectorization_factor, 0U));
2758
2759   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2760     {
2761       dump_printf_loc (MSG_NOTE, vect_location,
2762                        "vectorization_factor = ");
2763       dump_dec (MSG_NOTE, vectorization_factor);
2764       dump_printf (MSG_NOTE, ", niters = %wd\n",
2765                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2766     }
2767
2768   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2769
2770   /* Analyze the alignment of the data-refs in the loop.
2771      Fail if a data reference is found that cannot be vectorized.  */
2772
2773   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2774   if (!ok)
2775     {
2776       if (dump_enabled_p ())
2777         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2778                          "bad data alignment.\n");
2779       return ok;
2780     }
2781
2782   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2783      It is important to call pruning after vect_analyze_data_ref_accesses,
2784      since we use grouping information gathered by interleaving analysis.  */
2785   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2786   if (!ok)
2787     return ok;
2788
2789   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2790      vectorization, since we do not want to add extra peeling or
2791      add versioning for alignment.  */
2792   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2793     /* This pass will decide on using loop versioning and/or loop peeling in
2794        order to enhance the alignment of data references in the loop.  */
2795     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2796   if (!ok)
2797     return ok;
2798
2799   if (slp)
2800     {
2801       /* Analyze operations in the SLP instances.  Note this may
2802          remove unsupported SLP instances which makes the above
2803          SLP kind detection invalid.  */
2804       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2805       vect_slp_analyze_operations (loop_vinfo);
2806       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2807         {
2808           ok = opt_result::failure_at (vect_location,
2809                                        "unsupported SLP instances\n");
2810           goto again;
2811         }
2812
2813       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2814       slp_tree load_node, slp_root;
2815       unsigned i, x;
2816       slp_instance instance;
2817       bool can_use_lanes = true;
2818       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2819         {
2820           slp_root = SLP_INSTANCE_TREE (instance);
2821           int group_size = SLP_TREE_LANES (slp_root);
2822           tree vectype = SLP_TREE_VECTYPE (slp_root);
2823           bool loads_permuted = false;
2824           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2825             {
2826               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2827                 continue;
2828               unsigned j;
2829               stmt_vec_info load_info;
2830               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2831                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2832                   {
2833                     loads_permuted = true;
2834                     break;
2835                   }
2836             }
2837
2838           /* If the loads and stores can be handled with load/store-lane
2839              instructions record it and move on to the next instance.  */
2840           if (loads_permuted
2841               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2842               && vect_store_lanes_supported (vectype, group_size, false)
2843                    != IFN_LAST)
2844             {
2845               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2846                 {
2847                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2848                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2849                   /* Use SLP for strided accesses (or if we can't
2850                      load-lanes).  */
2851                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2852                       || vect_load_lanes_supported
2853                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2854                              DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2855                     break;
2856                 }
2857
2858               can_use_lanes
2859                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2860
2861               if (can_use_lanes && dump_enabled_p ())
2862                 dump_printf_loc (MSG_NOTE, vect_location,
2863                                  "SLP instance %p can use load/store-lanes\n",
2864                                  (void *) instance);
2865             }
2866           else
2867             {
2868               can_use_lanes = false;
2869               break;
2870             }
2871         }
2872
2873       /* If all SLP instances can use load/store-lanes abort SLP and try again
2874          with SLP disabled.  */
2875       if (can_use_lanes)
2876         {
2877           ok = opt_result::failure_at (vect_location,
2878                                        "Built SLP cancelled: can use "
2879                                        "load/store-lanes\n");
2880           if (dump_enabled_p ())
2881             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882                              "Built SLP cancelled: all SLP instances support "
2883                              "load/store-lanes\n");
2884           goto again;
2885         }
2886     }
2887
2888   /* Dissolve SLP-only groups.  */
2889   vect_dissolve_slp_only_groups (loop_vinfo);
2890
2891   /* Scan all the remaining operations in the loop that are not subject
2892      to SLP and make sure they are vectorizable.  */
2893   ok = vect_analyze_loop_operations (loop_vinfo);
2894   if (!ok)
2895     {
2896       if (dump_enabled_p ())
2897         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898                          "bad operation or unsupported loop bound.\n");
2899       return ok;
2900     }
2901
2902   /* For now, we don't expect to mix both masking and length approaches for one
2903      loop, disable it if both are recorded.  */
2904   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2905       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2906       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2907     {
2908       if (dump_enabled_p ())
2909         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910                          "can't vectorize a loop with partial vectors"
2911                          " because we don't expect to mix different"
2912                          " approaches with partial vectors for the"
2913                          " same loop.\n");
2914       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2915     }
2916
2917   /* If we still have the option of using partial vectors,
2918      check whether we can generate the necessary loop controls.  */
2919   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2920     {
2921       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2922         {
2923           if (!vect_verify_full_masking (loop_vinfo)
2924               && !vect_verify_full_masking_avx512 (loop_vinfo))
2925             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2926         }
2927       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2928         if (!vect_verify_loop_lens (loop_vinfo))
2929           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2930     }
2931
2932   /* If we're vectorizing a loop that uses length "controls" and
2933      can iterate more than once, we apply decrementing IV approach
2934      in loop control.  */
2935   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2936       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2937       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2938       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2939            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2940                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2941     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2942
2943   /* If a loop uses length controls and has a decrementing loop control IV,
2944      we will normally pass that IV through a MIN_EXPR to calcaluate the
2945      basis for the length controls.  E.g. in a loop that processes one
2946      element per scalar iteration, the number of elements would be
2947      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2948
2949      This MIN_EXPR approach allows us to use pointer IVs with an invariant
2950      step, since only the final iteration of the vector loop can have
2951      inactive lanes.
2952
2953      However, some targets have a dedicated instruction for calculating the
2954      preferred length, given the total number of elements that still need to
2955      be processed.  This is encapsulated in the SELECT_VL internal function.
2956
2957      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2958      to determine the basis for the length controls.  However, unlike the
2959      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2960      lanes inactive in any iteration of the vector loop, not just the last
2961      iteration.  This SELECT_VL approach therefore requires us to use pointer
2962      IVs with variable steps.
2963
2964      Once we've decided how many elements should be processed by one
2965      iteration of the vector loop, we need to populate the rgroup controls.
2966      If a loop has multiple rgroups, we need to make sure that those rgroups
2967      "line up" (that is, they must be consistent about which elements are
2968      active and which aren't).  This is done by vect_adjust_loop_lens_control.
2969
2970      In principle, it would be possible to use vect_adjust_loop_lens_control
2971      on either the result of a MIN_EXPR or the result of a SELECT_VL.
2972      However:
2973
2974      (1) In practice, it only makes sense to use SELECT_VL when a vector
2975          operation will be controlled directly by the result.  It is not
2976          worth using SELECT_VL if it would only be the input to other
2977          calculations.
2978
2979      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2980          pointer IV will need N updates by a variable amount (N-1 updates
2981          within the iteration and 1 update to move to the next iteration).
2982
2983      Because of this, we prefer to use the MIN_EXPR approach whenever there
2984      is more than one length control.
2985
2986      In addition, SELECT_VL always operates to a granularity of 1 unit.
2987      If we wanted to use it to control an SLP operation on N consecutive
2988      elements, we would need to make the SELECT_VL inputs measure scalar
2989      iterations (rather than elements) and then multiply the SELECT_VL
2990      result by N.  But using SELECT_VL this way is inefficient because
2991      of (1) above.
2992
2993      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2994         satisfied:
2995
2996      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2997      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2998
2999      Since SELECT_VL (variable step) will make SCEV analysis failed and then
3000      we will fail to gain benefits of following unroll optimizations. We prefer
3001      using the MIN_EXPR approach in this situation.  */
3002   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3003     {
3004       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3005       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3006                                           OPTIMIZE_FOR_SPEED)
3007           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3008           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3009           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3010               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3011         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3012     }
3013
3014   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3015      assuming that the loop will be used as a main loop.  We will redo
3016      this analysis later if we instead decide to use the loop as an
3017      epilogue loop.  */
3018   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3019   if (!ok)
3020     return ok;
3021
3022   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3023      to be able to handle fewer than VF scalars, or needs to have a lower VF
3024      than the main loop.  */
3025   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3026       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3027     {
3028       poly_uint64 unscaled_vf
3029         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3030                      orig_loop_vinfo->suggested_unroll_factor);
3031       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3032         return opt_result::failure_at (vect_location,
3033                                        "Vectorization factor too high for"
3034                                        " epilogue loop.\n");
3035     }
3036
3037   /* Check the costings of the loop make vectorizing worthwhile.  */
3038   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3039   if (res < 0)
3040     {
3041       ok = opt_result::failure_at (vect_location,
3042                                    "Loop costings may not be worthwhile.\n");
3043       goto again;
3044     }
3045   if (!res)
3046     return opt_result::failure_at (vect_location,
3047                                    "Loop costings not worthwhile.\n");
3048
3049   /* If an epilogue loop is required make sure we can create one.  */
3050   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3051       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3052     {
3053       if (dump_enabled_p ())
3054         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3055       if (!vect_can_advance_ivs_p (loop_vinfo)
3056           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3057                                            single_exit (LOOP_VINFO_LOOP
3058                                                          (loop_vinfo))))
3059         {
3060           ok = opt_result::failure_at (vect_location,
3061                                        "not vectorized: can't create required "
3062                                        "epilog loop\n");
3063           goto again;
3064         }
3065     }
3066
3067   /* During peeling, we need to check if number of loop iterations is
3068      enough for both peeled prolog loop and vector loop.  This check
3069      can be merged along with threshold check of loop versioning, so
3070      increase threshold for this case if necessary.
3071
3072      If we are analyzing an epilogue we still want to check what its
3073      versioning threshold would be.  If we decide to vectorize the epilogues we
3074      will want to use the lowest versioning threshold of all epilogues and main
3075      loop.  This will enable us to enter a vectorized epilogue even when
3076      versioning the loop.  We can't simply check whether the epilogue requires
3077      versioning though since we may have skipped some versioning checks when
3078      analyzing the epilogue.  For instance, checks for alias versioning will be
3079      skipped when dealing with epilogues as we assume we already checked them
3080      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3081   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3082     {
3083       poly_uint64 niters_th = 0;
3084       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3085
3086       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3087         {
3088           /* Niters for peeled prolog loop.  */
3089           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3090             {
3091               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3092               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3093               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3094             }
3095           else
3096             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3097         }
3098
3099       /* Niters for at least one iteration of vectorized loop.  */
3100       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3101         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3102       /* One additional iteration because of peeling for gap.  */
3103       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3104         niters_th += 1;
3105
3106       /*  Use the same condition as vect_transform_loop to decide when to use
3107           the cost to determine a versioning threshold.  */
3108       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3109           && ordered_p (th, niters_th))
3110         niters_th = ordered_max (poly_uint64 (th), niters_th);
3111
3112       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3113     }
3114
3115   gcc_assert (known_eq (vectorization_factor,
3116                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3117
3118   slp_done_for_suggested_uf = slp;
3119
3120   /* Ok to vectorize!  */
3121   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3122   return opt_result::success ();
3123
3124 again:
3125   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3126   gcc_assert (!ok);
3127
3128   /* Try again with SLP forced off but if we didn't do any SLP there is
3129      no point in re-trying.  */
3130   if (!slp)
3131     return ok;
3132
3133   /* If the slp decision is true when suggested unroll factor is worked
3134      out, and we are applying suggested unroll factor, we don't need to
3135      re-try any more.  */
3136   if (applying_suggested_uf && slp_done_for_suggested_uf)
3137     return ok;
3138
3139   /* If there are reduction chains re-trying will fail anyway.  */
3140   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3141     return ok;
3142
3143   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3144      via interleaving or lane instructions.  */
3145   slp_instance instance;
3146   slp_tree node;
3147   unsigned i, j;
3148   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3149     {
3150       stmt_vec_info vinfo;
3151       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3152       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3153         continue;
3154       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3155       unsigned int size = DR_GROUP_SIZE (vinfo);
3156       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3157       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3158          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3159          && ! vect_grouped_store_supported (vectype, size))
3160         return opt_result::failure_at (vinfo->stmt,
3161                                        "unsupported grouped store\n");
3162       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3163         {
3164           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3165           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3166           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3167           size = DR_GROUP_SIZE (vinfo);
3168           vectype = STMT_VINFO_VECTYPE (vinfo);
3169           if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3170               && ! vect_grouped_load_supported (vectype, single_element_p,
3171                                                 size))
3172             return opt_result::failure_at (vinfo->stmt,
3173                                            "unsupported grouped load\n");
3174         }
3175     }
3176
3177   if (dump_enabled_p ())
3178     dump_printf_loc (MSG_NOTE, vect_location,
3179                      "re-trying with SLP disabled\n");
3180
3181   /* Roll back state appropriately.  No SLP this time.  */
3182   slp = false;
3183   /* Restore vectorization factor as it were without SLP.  */
3184   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3185   /* Free the SLP instances.  */
3186   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3187     vect_free_slp_instance (instance);
3188   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3189   /* Reset SLP type to loop_vect on all stmts.  */
3190   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3191     {
3192       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3193       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3194            !gsi_end_p (si); gsi_next (&si))
3195         {
3196           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3197           STMT_SLP_TYPE (stmt_info) = loop_vect;
3198           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3199               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3200             {
3201               /* vectorizable_reduction adjusts reduction stmt def-types,
3202                  restore them to that of the PHI.  */
3203               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3204                 = STMT_VINFO_DEF_TYPE (stmt_info);
3205               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3206                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3207                 = STMT_VINFO_DEF_TYPE (stmt_info);
3208             }
3209         }
3210       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3211            !gsi_end_p (si); gsi_next (&si))
3212         {
3213           if (is_gimple_debug (gsi_stmt (si)))
3214             continue;
3215           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3216           STMT_SLP_TYPE (stmt_info) = loop_vect;
3217           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3218             {
3219               stmt_vec_info pattern_stmt_info
3220                 = STMT_VINFO_RELATED_STMT (stmt_info);
3221               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3222                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3223
3224               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3225               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3226               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3227                    !gsi_end_p (pi); gsi_next (&pi))
3228                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3229                   = loop_vect;
3230             }
3231         }
3232     }
3233   /* Free optimized alias test DDRS.  */
3234   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3235   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3236   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3237   /* Reset target cost data.  */
3238   delete loop_vinfo->vector_costs;
3239   loop_vinfo->vector_costs = nullptr;
3240   /* Reset accumulated rgroup information.  */
3241   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3242   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3243   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3244   /* Reset assorted flags.  */
3245   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3246   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3247   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3248   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3249   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3250     = saved_can_use_partial_vectors_p;
3251
3252   goto start_over;
3253 }
3254
3255 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3256    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3257    OLD_LOOP_VINFO is better unless something specifically indicates
3258    otherwise.
3259
3260    Note that this deliberately isn't a partial order.  */
3261
3262 static bool
3263 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3264                           loop_vec_info old_loop_vinfo)
3265 {
3266   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3267   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3268
3269   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3270   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3271
3272   /* Always prefer a VF of loop->simdlen over any other VF.  */
3273   if (loop->simdlen)
3274     {
3275       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3276       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3277       if (new_simdlen_p != old_simdlen_p)
3278         return new_simdlen_p;
3279     }
3280
3281   const auto *old_costs = old_loop_vinfo->vector_costs;
3282   const auto *new_costs = new_loop_vinfo->vector_costs;
3283   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3284     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3285
3286   return new_costs->better_main_loop_than_p (old_costs);
3287 }
3288
3289 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3290    true if we should.  */
3291
3292 static bool
3293 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3294                         loop_vec_info old_loop_vinfo)
3295 {
3296   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3297     return false;
3298
3299   if (dump_enabled_p ())
3300     dump_printf_loc (MSG_NOTE, vect_location,
3301                      "***** Preferring vector mode %s to vector mode %s\n",
3302                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3303                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3304   return true;
3305 }
3306
3307 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3308    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3309    MODE_I to the next mode useful to analyze.
3310    Return the loop_vinfo on success and wrapped null on failure.  */
3311
3312 static opt_loop_vec_info
3313 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3314                      const vect_loop_form_info *loop_form_info,
3315                      loop_vec_info main_loop_vinfo,
3316                      const vector_modes &vector_modes, unsigned &mode_i,
3317                      machine_mode &autodetected_vector_mode,
3318                      bool &fatal)
3319 {
3320   loop_vec_info loop_vinfo
3321     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3322
3323   machine_mode vector_mode = vector_modes[mode_i];
3324   loop_vinfo->vector_mode = vector_mode;
3325   unsigned int suggested_unroll_factor = 1;
3326   bool slp_done_for_suggested_uf = false;
3327
3328   /* Run the main analysis.  */
3329   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3330                                         &suggested_unroll_factor,
3331                                         slp_done_for_suggested_uf);
3332   if (dump_enabled_p ())
3333     dump_printf_loc (MSG_NOTE, vect_location,
3334                      "***** Analysis %s with vector mode %s\n",
3335                      res ? "succeeded" : " failed",
3336                      GET_MODE_NAME (loop_vinfo->vector_mode));
3337
3338   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3339     {
3340       if (dump_enabled_p ())
3341         dump_printf_loc (MSG_NOTE, vect_location,
3342                          "***** Re-trying analysis for unrolling"
3343                          " with unroll factor %d and slp %s.\n",
3344                          suggested_unroll_factor,
3345                          slp_done_for_suggested_uf ? "on" : "off");
3346       loop_vec_info unroll_vinfo
3347         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3348       unroll_vinfo->vector_mode = vector_mode;
3349       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3350       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3351                                                 slp_done_for_suggested_uf);
3352       if (new_res)
3353         {
3354           delete loop_vinfo;
3355           loop_vinfo = unroll_vinfo;
3356         }
3357       else
3358         delete unroll_vinfo;
3359     }
3360
3361   /* Remember the autodetected vector mode.  */
3362   if (vector_mode == VOIDmode)
3363     autodetected_vector_mode = loop_vinfo->vector_mode;
3364
3365   /* Advance mode_i, first skipping modes that would result in the
3366      same analysis result.  */
3367   while (mode_i + 1 < vector_modes.length ()
3368          && vect_chooses_same_modes_p (loop_vinfo,
3369                                        vector_modes[mode_i + 1]))
3370     {
3371       if (dump_enabled_p ())
3372         dump_printf_loc (MSG_NOTE, vect_location,
3373                          "***** The result for vector mode %s would"
3374                          " be the same\n",
3375                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3376       mode_i += 1;
3377     }
3378   if (mode_i + 1 < vector_modes.length ()
3379       && VECTOR_MODE_P (autodetected_vector_mode)
3380       && (related_vector_mode (vector_modes[mode_i + 1],
3381                                GET_MODE_INNER (autodetected_vector_mode))
3382           == autodetected_vector_mode)
3383       && (related_vector_mode (autodetected_vector_mode,
3384                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3385           == vector_modes[mode_i + 1]))
3386     {
3387       if (dump_enabled_p ())
3388         dump_printf_loc (MSG_NOTE, vect_location,
3389                          "***** Skipping vector mode %s, which would"
3390                          " repeat the analysis for %s\n",
3391                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3392                          GET_MODE_NAME (autodetected_vector_mode));
3393       mode_i += 1;
3394     }
3395   mode_i++;
3396
3397   if (!res)
3398     {
3399       delete loop_vinfo;
3400       if (fatal)
3401         gcc_checking_assert (main_loop_vinfo == NULL);
3402       return opt_loop_vec_info::propagate_failure (res);
3403     }
3404
3405   return opt_loop_vec_info::success (loop_vinfo);
3406 }
3407
3408 /* Function vect_analyze_loop.
3409
3410    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3411    for it.  The different analyses will record information in the
3412    loop_vec_info struct.  */
3413 opt_loop_vec_info
3414 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3415 {
3416   DUMP_VECT_SCOPE ("analyze_loop_nest");
3417
3418   if (loop_outer (loop)
3419       && loop_vec_info_for_loop (loop_outer (loop))
3420       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3421     return opt_loop_vec_info::failure_at (vect_location,
3422                                           "outer-loop already vectorized.\n");
3423
3424   if (!find_loop_nest (loop, &shared->loop_nest))
3425     return opt_loop_vec_info::failure_at
3426       (vect_location,
3427        "not vectorized: loop nest containing two or more consecutive inner"
3428        " loops cannot be vectorized\n");
3429
3430   /* Analyze the loop form.  */
3431   vect_loop_form_info loop_form_info;
3432   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3433   if (!res)
3434     {
3435       if (dump_enabled_p ())
3436         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3437                          "bad loop form.\n");
3438       return opt_loop_vec_info::propagate_failure (res);
3439     }
3440   if (!integer_onep (loop_form_info.assumptions))
3441     {
3442       /* We consider to vectorize this loop by versioning it under
3443          some assumptions.  In order to do this, we need to clear
3444          existing information computed by scev and niter analyzer.  */
3445       scev_reset_htab ();
3446       free_numbers_of_iterations_estimates (loop);
3447       /* Also set flag for this loop so that following scev and niter
3448          analysis are done under the assumptions.  */
3449       loop_constraint_set (loop, LOOP_C_FINITE);
3450     }
3451
3452   auto_vector_modes vector_modes;
3453   /* Autodetect first vector size we try.  */
3454   vector_modes.safe_push (VOIDmode);
3455   unsigned int autovec_flags
3456     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3457                                                     loop->simdlen != 0);
3458   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3459                              && !unlimited_cost_model (loop));
3460   machine_mode autodetected_vector_mode = VOIDmode;
3461   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3462   unsigned int mode_i = 0;
3463   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3464
3465   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3466      a mode has not been analyzed.  */
3467   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3468   for (unsigned i = 0; i < vector_modes.length (); ++i)
3469     cached_vf_per_mode.safe_push (0);
3470
3471   /* First determine the main loop vectorization mode, either the first
3472      one that works, starting with auto-detecting the vector mode and then
3473      following the targets order of preference, or the one with the
3474      lowest cost if pick_lowest_cost_p.  */
3475   while (1)
3476     {
3477       bool fatal;
3478       unsigned int last_mode_i = mode_i;
3479       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3480          failed.  */
3481       cached_vf_per_mode[last_mode_i] = -1;
3482       opt_loop_vec_info loop_vinfo
3483         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3484                                NULL, vector_modes, mode_i,
3485                                autodetected_vector_mode, fatal);
3486       if (fatal)
3487         break;
3488
3489       if (loop_vinfo)
3490         {
3491           /*  Analyzis has been successful so update the VF value.  The
3492               VF should always be a multiple of unroll_factor and we want to
3493               capture the original VF here.  */
3494           cached_vf_per_mode[last_mode_i]
3495             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3496                          loop_vinfo->suggested_unroll_factor);
3497           /* Once we hit the desired simdlen for the first time,
3498              discard any previous attempts.  */
3499           if (simdlen
3500               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3501             {
3502               delete first_loop_vinfo;
3503               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3504               simdlen = 0;
3505             }
3506           else if (pick_lowest_cost_p
3507                    && first_loop_vinfo
3508                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3509             {
3510               /* Pick loop_vinfo over first_loop_vinfo.  */
3511               delete first_loop_vinfo;
3512               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3513             }
3514           if (first_loop_vinfo == NULL)
3515             first_loop_vinfo = loop_vinfo;
3516           else
3517             {
3518               delete loop_vinfo;
3519               loop_vinfo = opt_loop_vec_info::success (NULL);
3520             }
3521
3522           /* Commit to first_loop_vinfo if we have no reason to try
3523              alternatives.  */
3524           if (!simdlen && !pick_lowest_cost_p)
3525             break;
3526         }
3527       if (mode_i == vector_modes.length ()
3528           || autodetected_vector_mode == VOIDmode)
3529         break;
3530
3531       /* Try the next biggest vector size.  */
3532       if (dump_enabled_p ())
3533         dump_printf_loc (MSG_NOTE, vect_location,
3534                          "***** Re-trying analysis with vector mode %s\n",
3535                          GET_MODE_NAME (vector_modes[mode_i]));
3536     }
3537   if (!first_loop_vinfo)
3538     return opt_loop_vec_info::propagate_failure (res);
3539
3540   if (dump_enabled_p ())
3541     dump_printf_loc (MSG_NOTE, vect_location,
3542                      "***** Choosing vector mode %s\n",
3543                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3544
3545   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3546      enabled, SIMDUID is not set, it is the innermost loop and we have
3547      either already found the loop's SIMDLEN or there was no SIMDLEN to
3548      begin with.
3549      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3550   bool vect_epilogues = (!simdlen
3551                          && loop->inner == NULL
3552                          && param_vect_epilogues_nomask
3553                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3554                          && !loop->simduid);
3555   if (!vect_epilogues)
3556     return first_loop_vinfo;
3557
3558   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3559   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3560
3561   /* For epilogues start the analysis from the first mode.  The motivation
3562      behind starting from the beginning comes from cases where the VECTOR_MODES
3563      array may contain length-agnostic and length-specific modes.  Their
3564      ordering is not guaranteed, so we could end up picking a mode for the main
3565      loop that is after the epilogue's optimal mode.  */
3566   vector_modes[0] = autodetected_vector_mode;
3567   mode_i = 0;
3568
3569   bool supports_partial_vectors =
3570     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3571   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3572
3573   while (1)
3574     {
3575       /* If the target does not support partial vectors we can shorten the
3576          number of modes to analyze for the epilogue as we know we can't pick a
3577          mode that would lead to a VF at least as big as the
3578          FIRST_VINFO_VF.  */
3579       if (!supports_partial_vectors
3580           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3581         {
3582           mode_i++;
3583           if (mode_i == vector_modes.length ())
3584             break;
3585           continue;
3586         }
3587
3588       if (dump_enabled_p ())
3589         dump_printf_loc (MSG_NOTE, vect_location,
3590                          "***** Re-trying epilogue analysis with vector "
3591                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3592
3593       bool fatal;
3594       opt_loop_vec_info loop_vinfo
3595         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3596                                first_loop_vinfo,
3597                                vector_modes, mode_i,
3598                                autodetected_vector_mode, fatal);
3599       if (fatal)
3600         break;
3601
3602       if (loop_vinfo)
3603         {
3604           if (pick_lowest_cost_p)
3605             {
3606               /* Keep trying to roll back vectorization attempts while the
3607                  loop_vec_infos they produced were worse than this one.  */
3608               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3609               while (!vinfos.is_empty ()
3610                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3611                 {
3612                   gcc_assert (vect_epilogues);
3613                   delete vinfos.pop ();
3614                 }
3615             }
3616           /* For now only allow one epilogue loop.  */
3617           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3618             {
3619               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3620               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3621               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3622                           || maybe_ne (lowest_th, 0U));
3623               /* Keep track of the known smallest versioning
3624                  threshold.  */
3625               if (ordered_p (lowest_th, th))
3626                 lowest_th = ordered_min (lowest_th, th);
3627             }
3628           else
3629             {
3630               delete loop_vinfo;
3631               loop_vinfo = opt_loop_vec_info::success (NULL);
3632             }
3633
3634           /* For now only allow one epilogue loop, but allow
3635              pick_lowest_cost_p to replace it, so commit to the
3636              first epilogue if we have no reason to try alternatives.  */
3637           if (!pick_lowest_cost_p)
3638             break;
3639         }
3640
3641       if (mode_i == vector_modes.length ())
3642         break;
3643
3644     }
3645
3646   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3647     {
3648       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3649       if (dump_enabled_p ())
3650         dump_printf_loc (MSG_NOTE, vect_location,
3651                          "***** Choosing epilogue vector mode %s\n",
3652                          GET_MODE_NAME
3653                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3654     }
3655
3656   return first_loop_vinfo;
3657 }
3658
3659 /* Return true if there is an in-order reduction function for CODE, storing
3660    it in *REDUC_FN if so.  */
3661
3662 static bool
3663 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3664 {
3665   if (code == PLUS_EXPR)
3666     {
3667       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3668       return true;
3669     }
3670   return false;
3671 }
3672
3673 /* Function reduction_fn_for_scalar_code
3674
3675    Input:
3676    CODE - tree_code of a reduction operations.
3677
3678    Output:
3679    REDUC_FN - the corresponding internal function to be used to reduce the
3680       vector of partial results into a single scalar result, or IFN_LAST
3681       if the operation is a supported reduction operation, but does not have
3682       such an internal function.
3683
3684    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3685
3686 bool
3687 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3688 {
3689   if (code.is_tree_code ())
3690     switch (tree_code (code))
3691       {
3692       case MAX_EXPR:
3693         *reduc_fn = IFN_REDUC_MAX;
3694         return true;
3695
3696       case MIN_EXPR:
3697         *reduc_fn = IFN_REDUC_MIN;
3698         return true;
3699
3700       case PLUS_EXPR:
3701         *reduc_fn = IFN_REDUC_PLUS;
3702         return true;
3703
3704       case BIT_AND_EXPR:
3705         *reduc_fn = IFN_REDUC_AND;
3706         return true;
3707
3708       case BIT_IOR_EXPR:
3709         *reduc_fn = IFN_REDUC_IOR;
3710         return true;
3711
3712       case BIT_XOR_EXPR:
3713         *reduc_fn = IFN_REDUC_XOR;
3714         return true;
3715
3716       case MULT_EXPR:
3717       case MINUS_EXPR:
3718         *reduc_fn = IFN_LAST;
3719         return true;
3720
3721       default:
3722         return false;
3723       }
3724   else
3725     switch (combined_fn (code))
3726       {
3727       CASE_CFN_FMAX:
3728         *reduc_fn = IFN_REDUC_FMAX;
3729         return true;
3730
3731       CASE_CFN_FMIN:
3732         *reduc_fn = IFN_REDUC_FMIN;
3733         return true;
3734
3735       default:
3736         return false;
3737       }
3738 }
3739
3740 /* If there is a neutral value X such that a reduction would not be affected
3741    by the introduction of additional X elements, return that X, otherwise
3742    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3743    of the scalar elements.  If the reduction has just a single initial value
3744    then INITIAL_VALUE is that value, otherwise it is null.  */
3745
3746 tree
3747 neutral_op_for_reduction (tree scalar_type, code_helper code,
3748                           tree initial_value)
3749 {
3750   if (code.is_tree_code ())
3751     switch (tree_code (code))
3752       {
3753       case WIDEN_SUM_EXPR:
3754       case DOT_PROD_EXPR:
3755       case SAD_EXPR:
3756       case PLUS_EXPR:
3757       case MINUS_EXPR:
3758       case BIT_IOR_EXPR:
3759       case BIT_XOR_EXPR:
3760         return build_zero_cst (scalar_type);
3761
3762       case MULT_EXPR:
3763         return build_one_cst (scalar_type);
3764
3765       case BIT_AND_EXPR:
3766         return build_all_ones_cst (scalar_type);
3767
3768       case MAX_EXPR:
3769       case MIN_EXPR:
3770         return initial_value;
3771
3772       default:
3773         return NULL_TREE;
3774       }
3775   else
3776     switch (combined_fn (code))
3777       {
3778       CASE_CFN_FMIN:
3779       CASE_CFN_FMAX:
3780         return initial_value;
3781
3782       default:
3783         return NULL_TREE;
3784       }
3785 }
3786
3787 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3788    STMT is printed with a message MSG. */
3789
3790 static void
3791 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3792 {
3793   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3794 }
3795
3796 /* Return true if we need an in-order reduction for operation CODE
3797    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3798    overflow must wrap.  */
3799
3800 bool
3801 needs_fold_left_reduction_p (tree type, code_helper code)
3802 {
3803   /* CHECKME: check for !flag_finite_math_only too?  */
3804   if (SCALAR_FLOAT_TYPE_P (type))
3805     {
3806       if (code.is_tree_code ())
3807         switch (tree_code (code))
3808           {
3809           case MIN_EXPR:
3810           case MAX_EXPR:
3811             return false;
3812
3813           default:
3814             return !flag_associative_math;
3815           }
3816       else
3817         switch (combined_fn (code))
3818           {
3819           CASE_CFN_FMIN:
3820           CASE_CFN_FMAX:
3821             return false;
3822
3823           default:
3824             return !flag_associative_math;
3825           }
3826     }
3827
3828   if (INTEGRAL_TYPE_P (type))
3829     return (!code.is_tree_code ()
3830             || !operation_no_trapping_overflow (type, tree_code (code)));
3831
3832   if (SAT_FIXED_POINT_TYPE_P (type))
3833     return true;
3834
3835   return false;
3836 }
3837
3838 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3839    has a handled computation expression.  Store the main reduction
3840    operation in *CODE.  */
3841
3842 static bool
3843 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3844                       tree loop_arg, code_helper *code,
3845                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3846 {
3847   auto_bitmap visited;
3848   tree lookfor = PHI_RESULT (phi);
3849   ssa_op_iter curri;
3850   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3851   while (USE_FROM_PTR (curr) != loop_arg)
3852     curr = op_iter_next_use (&curri);
3853   curri.i = curri.numops;
3854   do
3855     {
3856       path.safe_push (std::make_pair (curri, curr));
3857       tree use = USE_FROM_PTR (curr);
3858       if (use == lookfor)
3859         break;
3860       gimple *def = SSA_NAME_DEF_STMT (use);
3861       if (gimple_nop_p (def)
3862           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3863         {
3864 pop:
3865           do
3866             {
3867               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3868               curri = x.first;
3869               curr = x.second;
3870               do
3871                 curr = op_iter_next_use (&curri);
3872               /* Skip already visited or non-SSA operands (from iterating
3873                  over PHI args).  */
3874               while (curr != NULL_USE_OPERAND_P
3875                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3876                          || ! bitmap_set_bit (visited,
3877                                               SSA_NAME_VERSION
3878                                                 (USE_FROM_PTR (curr)))));
3879             }
3880           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3881           if (curr == NULL_USE_OPERAND_P)
3882             break;
3883         }
3884       else
3885         {
3886           if (gimple_code (def) == GIMPLE_PHI)
3887             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3888           else
3889             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3890           while (curr != NULL_USE_OPERAND_P
3891                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3892                      || ! bitmap_set_bit (visited,
3893                                           SSA_NAME_VERSION
3894                                             (USE_FROM_PTR (curr)))))
3895             curr = op_iter_next_use (&curri);
3896           if (curr == NULL_USE_OPERAND_P)
3897             goto pop;
3898         }
3899     }
3900   while (1);
3901   if (dump_file && (dump_flags & TDF_DETAILS))
3902     {
3903       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3904       unsigned i;
3905       std::pair<ssa_op_iter, use_operand_p> *x;
3906       FOR_EACH_VEC_ELT (path, i, x)
3907         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3908       dump_printf (MSG_NOTE, "\n");
3909     }
3910
3911   /* Check whether the reduction path detected is valid.  */
3912   bool fail = path.length () == 0;
3913   bool neg = false;
3914   int sign = -1;
3915   *code = ERROR_MARK;
3916   for (unsigned i = 1; i < path.length (); ++i)
3917     {
3918       gimple *use_stmt = USE_STMT (path[i].second);
3919       gimple_match_op op;
3920       if (!gimple_extract_op (use_stmt, &op))
3921         {
3922           fail = true;
3923           break;
3924         }
3925       unsigned int opi = op.num_ops;
3926       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3927         {
3928           /* The following make sure we can compute the operand index
3929              easily plus it mostly disallows chaining via COND_EXPR condition
3930              operands.  */
3931           for (opi = 0; opi < op.num_ops; ++opi)
3932             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3933               break;
3934         }
3935       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3936         {
3937           for (opi = 0; opi < op.num_ops; ++opi)
3938             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3939               break;
3940         }
3941       if (opi == op.num_ops)
3942         {
3943           fail = true;
3944           break;
3945         }
3946       op.code = canonicalize_code (op.code, op.type);
3947       if (op.code == MINUS_EXPR)
3948         {
3949           op.code = PLUS_EXPR;
3950           /* Track whether we negate the reduction value each iteration.  */
3951           if (op.ops[1] == op.ops[opi])
3952             neg = ! neg;
3953         }
3954       if (CONVERT_EXPR_CODE_P (op.code)
3955           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3956         ;
3957       else if (*code == ERROR_MARK)
3958         {
3959           *code = op.code;
3960           sign = TYPE_SIGN (op.type);
3961         }
3962       else if (op.code != *code)
3963         {
3964           fail = true;
3965           break;
3966         }
3967       else if ((op.code == MIN_EXPR
3968                 || op.code == MAX_EXPR)
3969                && sign != TYPE_SIGN (op.type))
3970         {
3971           fail = true;
3972           break;
3973         }
3974       /* Check there's only a single stmt the op is used on.  For the
3975          not value-changing tail and the last stmt allow out-of-loop uses.
3976          ???  We could relax this and handle arbitrary live stmts by
3977          forcing a scalar epilogue for example.  */
3978       imm_use_iterator imm_iter;
3979       gimple *op_use_stmt;
3980       unsigned cnt = 0;
3981       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3982         if (!is_gimple_debug (op_use_stmt)
3983             && (*code != ERROR_MARK
3984                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3985           {
3986             /* We want to allow x + x but not x < 1 ? x : 2.  */
3987             if (is_gimple_assign (op_use_stmt)
3988                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3989               {
3990                 use_operand_p use_p;
3991                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3992                   cnt++;
3993               }
3994             else
3995               cnt++;
3996           }
3997       if (cnt != 1)
3998         {
3999           fail = true;
4000           break;
4001         }
4002     }
4003   return ! fail && ! neg && *code != ERROR_MARK;
4004 }
4005
4006 bool
4007 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4008                       tree loop_arg, enum tree_code code)
4009 {
4010   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4011   code_helper code_;
4012   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4013           && code_ == code);
4014 }
4015
4016
4017
4018 /* Function vect_is_simple_reduction
4019
4020    (1) Detect a cross-iteration def-use cycle that represents a simple
4021    reduction computation.  We look for the following pattern:
4022
4023    loop_header:
4024      a1 = phi < a0, a2 >
4025      a3 = ...
4026      a2 = operation (a3, a1)
4027
4028    or
4029
4030    a3 = ...
4031    loop_header:
4032      a1 = phi < a0, a2 >
4033      a2 = operation (a3, a1)
4034
4035    such that:
4036    1. operation is commutative and associative and it is safe to
4037       change the order of the computation
4038    2. no uses for a2 in the loop (a2 is used out of the loop)
4039    3. no uses of a1 in the loop besides the reduction operation
4040    4. no uses of a1 outside the loop.
4041
4042    Conditions 1,4 are tested here.
4043    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4044
4045    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4046    nested cycles.
4047
4048    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4049    reductions:
4050
4051      a1 = phi < a0, a2 >
4052      inner loop (def of a3)
4053      a2 = phi < a3 >
4054
4055    (4) Detect condition expressions, ie:
4056      for (int i = 0; i < N; i++)
4057        if (a[i] < val)
4058         ret_val = a[i];
4059
4060 */
4061
4062 static stmt_vec_info
4063 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4064                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4065 {
4066   gphi *phi = as_a <gphi *> (phi_info->stmt);
4067   gimple *phi_use_stmt = NULL;
4068   imm_use_iterator imm_iter;
4069   use_operand_p use_p;
4070
4071   *double_reduc = false;
4072   *reduc_chain_p = false;
4073   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4074
4075   tree phi_name = PHI_RESULT (phi);
4076   /* ???  If there are no uses of the PHI result the inner loop reduction
4077      won't be detected as possibly double-reduction by vectorizable_reduction
4078      because that tries to walk the PHI arg from the preheader edge which
4079      can be constant.  See PR60382.  */
4080   if (has_zero_uses (phi_name))
4081     return NULL;
4082   class loop *loop = (gimple_bb (phi))->loop_father;
4083   unsigned nphi_def_loop_uses = 0;
4084   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4085     {
4086       gimple *use_stmt = USE_STMT (use_p);
4087       if (is_gimple_debug (use_stmt))
4088         continue;
4089
4090       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4091         {
4092           if (dump_enabled_p ())
4093             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4094                              "intermediate value used outside loop.\n");
4095
4096           return NULL;
4097         }
4098
4099       nphi_def_loop_uses++;
4100       phi_use_stmt = use_stmt;
4101     }
4102
4103   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4104   if (TREE_CODE (latch_def) != SSA_NAME)
4105     {
4106       if (dump_enabled_p ())
4107         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4108                          "reduction: not ssa_name: %T\n", latch_def);
4109       return NULL;
4110     }
4111
4112   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4113   if (!def_stmt_info
4114       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4115     return NULL;
4116
4117   bool nested_in_vect_loop
4118     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4119   unsigned nlatch_def_loop_uses = 0;
4120   auto_vec<gphi *, 3> lcphis;
4121   bool inner_loop_of_double_reduc = false;
4122   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4123     {
4124       gimple *use_stmt = USE_STMT (use_p);
4125       if (is_gimple_debug (use_stmt))
4126         continue;
4127       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4128         nlatch_def_loop_uses++;
4129       else
4130         {
4131           /* We can have more than one loop-closed PHI.  */
4132           lcphis.safe_push (as_a <gphi *> (use_stmt));
4133           if (nested_in_vect_loop
4134               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4135                   == vect_double_reduction_def))
4136             inner_loop_of_double_reduc = true;
4137         }
4138     }
4139
4140   /* If we are vectorizing an inner reduction we are executing that
4141      in the original order only in case we are not dealing with a
4142      double reduction.  */
4143   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4144     {
4145       if (dump_enabled_p ())
4146         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4147                         "detected nested cycle: ");
4148       return def_stmt_info;
4149     }
4150
4151   /* When the inner loop of a double reduction ends up with more than
4152      one loop-closed PHI we have failed to classify alternate such
4153      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4154   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4155     {
4156       if (dump_enabled_p ())
4157         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4158                          "unhandle double reduction\n");
4159       return NULL;
4160     }
4161
4162   /* If this isn't a nested cycle or if the nested cycle reduction value
4163      is used ouside of the inner loop we cannot handle uses of the reduction
4164      value.  */
4165   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4166     {
4167       if (dump_enabled_p ())
4168         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4169                          "reduction used in loop.\n");
4170       return NULL;
4171     }
4172
4173   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4174      defined in the inner loop.  */
4175   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4176     {
4177       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4178       if (gimple_phi_num_args (def_stmt) != 1
4179           || TREE_CODE (op1) != SSA_NAME)
4180         {
4181           if (dump_enabled_p ())
4182             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183                              "unsupported phi node definition.\n");
4184
4185           return NULL;
4186         }
4187
4188       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4189          and the latch definition op1.  */
4190       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4191       if (gimple_bb (def1)
4192           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4193           && loop->inner
4194           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4195           && (is_gimple_assign (def1) || is_gimple_call (def1))
4196           && is_a <gphi *> (phi_use_stmt)
4197           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4198           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4199                                             loop_latch_edge (loop->inner))))
4200         {
4201           if (dump_enabled_p ())
4202             report_vect_op (MSG_NOTE, def_stmt,
4203                             "detected double reduction: ");
4204
4205           *double_reduc = true;
4206           return def_stmt_info;
4207         }
4208
4209       return NULL;
4210     }
4211
4212   /* Look for the expression computing latch_def from then loop PHI result.  */
4213   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4214   code_helper code;
4215   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4216                             path))
4217     {
4218       STMT_VINFO_REDUC_CODE (phi_info) = code;
4219       if (code == COND_EXPR && !nested_in_vect_loop)
4220         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4221
4222       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4223          reduction chain for which the additional restriction is that
4224          all operations in the chain are the same.  */
4225       auto_vec<stmt_vec_info, 8> reduc_chain;
4226       unsigned i;
4227       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4228       for (i = path.length () - 1; i >= 1; --i)
4229         {
4230           gimple *stmt = USE_STMT (path[i].second);
4231           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4232           gimple_match_op op;
4233           if (!gimple_extract_op (stmt, &op))
4234             gcc_unreachable ();
4235           if (gassign *assign = dyn_cast<gassign *> (stmt))
4236             STMT_VINFO_REDUC_IDX (stmt_info)
4237               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4238           else
4239             {
4240               gcall *call = as_a<gcall *> (stmt);
4241               STMT_VINFO_REDUC_IDX (stmt_info)
4242                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4243             }
4244           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4245                                      && (i == 1 || i == path.length () - 1));
4246           if ((op.code != code && !leading_conversion)
4247               /* We can only handle the final value in epilogue
4248                  generation for reduction chains.  */
4249               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4250             is_slp_reduc = false;
4251           /* For reduction chains we support a trailing/leading
4252              conversions.  We do not store those in the actual chain.  */
4253           if (leading_conversion)
4254             continue;
4255           reduc_chain.safe_push (stmt_info);
4256         }
4257       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4258         {
4259           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4260             {
4261               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4262               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4263             }
4264           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4265           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4266
4267           /* Save the chain for further analysis in SLP detection.  */
4268           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4269           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4270
4271           *reduc_chain_p = true;
4272           if (dump_enabled_p ())
4273             dump_printf_loc (MSG_NOTE, vect_location,
4274                             "reduction: detected reduction chain\n");
4275         }
4276       else if (dump_enabled_p ())
4277         dump_printf_loc (MSG_NOTE, vect_location,
4278                          "reduction: detected reduction\n");
4279
4280       return def_stmt_info;
4281     }
4282
4283   if (dump_enabled_p ())
4284     dump_printf_loc (MSG_NOTE, vect_location,
4285                      "reduction: unknown pattern\n");
4286
4287   return NULL;
4288 }
4289
4290 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4291    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4292    or -1 if not known.  */
4293
4294 static int
4295 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4296 {
4297   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4298   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4299     {
4300       if (dump_enabled_p ())
4301         dump_printf_loc (MSG_NOTE, vect_location,
4302                          "cost model: epilogue peel iters set to vf/2 "
4303                          "because loop iterations are unknown .\n");
4304       return assumed_vf / 2;
4305     }
4306   else
4307     {
4308       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4309       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4310       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4311       /* If we need to peel for gaps, but no peeling is required, we have to
4312          peel VF iterations.  */
4313       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4314         peel_iters_epilogue = assumed_vf;
4315       return peel_iters_epilogue;
4316     }
4317 }
4318
4319 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4320 int
4321 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4322                              int *peel_iters_epilogue,
4323                              stmt_vector_for_cost *scalar_cost_vec,
4324                              stmt_vector_for_cost *prologue_cost_vec,
4325                              stmt_vector_for_cost *epilogue_cost_vec)
4326 {
4327   int retval = 0;
4328
4329   *peel_iters_epilogue
4330     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4331
4332   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4333     {
4334       /* If peeled iterations are known but number of scalar loop
4335          iterations are unknown, count a taken branch per peeled loop.  */
4336       if (peel_iters_prologue > 0)
4337         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4338                                    vect_prologue);
4339       if (*peel_iters_epilogue > 0)
4340         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4341                                     vect_epilogue);
4342     }
4343
4344   stmt_info_for_cost *si;
4345   int j;
4346   if (peel_iters_prologue)
4347     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4348       retval += record_stmt_cost (prologue_cost_vec,
4349                                   si->count * peel_iters_prologue,
4350                                   si->kind, si->stmt_info, si->misalign,
4351                                   vect_prologue);
4352   if (*peel_iters_epilogue)
4353     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4354       retval += record_stmt_cost (epilogue_cost_vec,
4355                                   si->count * *peel_iters_epilogue,
4356                                   si->kind, si->stmt_info, si->misalign,
4357                                   vect_epilogue);
4358
4359   return retval;
4360 }
4361
4362 /* Function vect_estimate_min_profitable_iters
4363
4364    Return the number of iterations required for the vector version of the
4365    loop to be profitable relative to the cost of the scalar version of the
4366    loop.
4367
4368    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4369    of iterations for vectorization.  -1 value means loop vectorization
4370    is not profitable.  This returned value may be used for dynamic
4371    profitability check.
4372
4373    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4374    for static check against estimated number of iterations.  */
4375
4376 static void
4377 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4378                                     int *ret_min_profitable_niters,
4379                                     int *ret_min_profitable_estimate,
4380                                     unsigned *suggested_unroll_factor)
4381 {
4382   int min_profitable_iters;
4383   int min_profitable_estimate;
4384   int peel_iters_prologue;
4385   int peel_iters_epilogue;
4386   unsigned vec_inside_cost = 0;
4387   int vec_outside_cost = 0;
4388   unsigned vec_prologue_cost = 0;
4389   unsigned vec_epilogue_cost = 0;
4390   int scalar_single_iter_cost = 0;
4391   int scalar_outside_cost = 0;
4392   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4393   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4394   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4395
4396   /* Cost model disabled.  */
4397   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4398     {
4399       if (dump_enabled_p ())
4400         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4401       *ret_min_profitable_niters = 0;
4402       *ret_min_profitable_estimate = 0;
4403       return;
4404     }
4405
4406   /* Requires loop versioning tests to handle misalignment.  */
4407   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4408     {
4409       /*  FIXME: Make cost depend on complexity of individual check.  */
4410       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4411       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4412       if (dump_enabled_p ())
4413         dump_printf (MSG_NOTE,
4414                      "cost model: Adding cost of checks for loop "
4415                      "versioning to treat misalignment.\n");
4416     }
4417
4418   /* Requires loop versioning with alias checks.  */
4419   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4420     {
4421       /*  FIXME: Make cost depend on complexity of individual check.  */
4422       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4423       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4424       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4425       if (len)
4426         /* Count LEN - 1 ANDs and LEN comparisons.  */
4427         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4428                               scalar_stmt, vect_prologue);
4429       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4430       if (len)
4431         {
4432           /* Count LEN - 1 ANDs and LEN comparisons.  */
4433           unsigned int nstmts = len * 2 - 1;
4434           /* +1 for each bias that needs adding.  */
4435           for (unsigned int i = 0; i < len; ++i)
4436             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4437               nstmts += 1;
4438           (void) add_stmt_cost (target_cost_data, nstmts,
4439                                 scalar_stmt, vect_prologue);
4440         }
4441       if (dump_enabled_p ())
4442         dump_printf (MSG_NOTE,
4443                      "cost model: Adding cost of checks for loop "
4444                      "versioning aliasing.\n");
4445     }
4446
4447   /* Requires loop versioning with niter checks.  */
4448   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4449     {
4450       /*  FIXME: Make cost depend on complexity of individual check.  */
4451       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4452                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4453       if (dump_enabled_p ())
4454         dump_printf (MSG_NOTE,
4455                      "cost model: Adding cost of checks for loop "
4456                      "versioning niters.\n");
4457     }
4458
4459   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4460     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4461                           vect_prologue);
4462
4463   /* Count statements in scalar loop.  Using this as scalar cost for a single
4464      iteration for now.
4465
4466      TODO: Add outer loop support.
4467
4468      TODO: Consider assigning different costs to different scalar
4469      statements.  */
4470
4471   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4472
4473   /* Add additional cost for the peeled instructions in prologue and epilogue
4474      loop.  (For fully-masked loops there will be no peeling.)
4475
4476      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4477      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4478
4479      TODO: Build an expression that represents peel_iters for prologue and
4480      epilogue to be used in a run-time test.  */
4481
4482   bool prologue_need_br_taken_cost = false;
4483   bool prologue_need_br_not_taken_cost = false;
4484
4485   /* Calculate peel_iters_prologue.  */
4486   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4487     peel_iters_prologue = 0;
4488   else if (npeel < 0)
4489     {
4490       peel_iters_prologue = assumed_vf / 2;
4491       if (dump_enabled_p ())
4492         dump_printf (MSG_NOTE, "cost model: "
4493                      "prologue peel iters set to vf/2.\n");
4494
4495       /* If peeled iterations are unknown, count a taken branch and a not taken
4496          branch per peeled loop.  Even if scalar loop iterations are known,
4497          vector iterations are not known since peeled prologue iterations are
4498          not known.  Hence guards remain the same.  */
4499       prologue_need_br_taken_cost = true;
4500       prologue_need_br_not_taken_cost = true;
4501     }
4502   else
4503     {
4504       peel_iters_prologue = npeel;
4505       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4506         /* If peeled iterations are known but number of scalar loop
4507            iterations are unknown, count a taken branch per peeled loop.  */
4508         prologue_need_br_taken_cost = true;
4509     }
4510
4511   bool epilogue_need_br_taken_cost = false;
4512   bool epilogue_need_br_not_taken_cost = false;
4513
4514   /* Calculate peel_iters_epilogue.  */
4515   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4516     /* We need to peel exactly one iteration for gaps.  */
4517     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4518   else if (npeel < 0)
4519     {
4520       /* If peeling for alignment is unknown, loop bound of main loop
4521          becomes unknown.  */
4522       peel_iters_epilogue = assumed_vf / 2;
4523       if (dump_enabled_p ())
4524         dump_printf (MSG_NOTE, "cost model: "
4525                      "epilogue peel iters set to vf/2 because "
4526                      "peeling for alignment is unknown.\n");
4527
4528       /* See the same reason above in peel_iters_prologue calculation.  */
4529       epilogue_need_br_taken_cost = true;
4530       epilogue_need_br_not_taken_cost = true;
4531     }
4532   else
4533     {
4534       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4535       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4536         /* If peeled iterations are known but number of scalar loop
4537            iterations are unknown, count a taken branch per peeled loop.  */
4538         epilogue_need_br_taken_cost = true;
4539     }
4540
4541   stmt_info_for_cost *si;
4542   int j;
4543   /* Add costs associated with peel_iters_prologue.  */
4544   if (peel_iters_prologue)
4545     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4546       {
4547         (void) add_stmt_cost (target_cost_data,
4548                               si->count * peel_iters_prologue, si->kind,
4549                               si->stmt_info, si->node, si->vectype,
4550                               si->misalign, vect_prologue);
4551       }
4552
4553   /* Add costs associated with peel_iters_epilogue.  */
4554   if (peel_iters_epilogue)
4555     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4556       {
4557         (void) add_stmt_cost (target_cost_data,
4558                               si->count * peel_iters_epilogue, si->kind,
4559                               si->stmt_info, si->node, si->vectype,
4560                               si->misalign, vect_epilogue);
4561       }
4562
4563   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4564
4565   if (prologue_need_br_taken_cost)
4566     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4567                           vect_prologue);
4568
4569   if (prologue_need_br_not_taken_cost)
4570     (void) add_stmt_cost (target_cost_data, 1,
4571                           cond_branch_not_taken, vect_prologue);
4572
4573   if (epilogue_need_br_taken_cost)
4574     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4575                           vect_epilogue);
4576
4577   if (epilogue_need_br_not_taken_cost)
4578     (void) add_stmt_cost (target_cost_data, 1,
4579                           cond_branch_not_taken, vect_epilogue);
4580
4581   /* Take care of special costs for rgroup controls of partial vectors.  */
4582   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4583       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4584           == vect_partial_vectors_avx512))
4585     {
4586       /* Calculate how many masks we need to generate.  */
4587       unsigned int num_masks = 0;
4588       bool need_saturation = false;
4589       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4590         if (rgm.type)
4591           {
4592             unsigned nvectors = rgm.factor;
4593             num_masks += nvectors;
4594             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4595                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4596               need_saturation = true;
4597           }
4598
4599       /* ???  The target isn't able to identify the costs below as
4600          producing masks so it cannot penaltize cases where we'd run
4601          out of mask registers for example.  */
4602
4603       /* ???  We are also failing to account for smaller vector masks
4604          we generate by splitting larger masks in vect_get_loop_mask.  */
4605
4606       /* In the worst case, we need to generate each mask in the prologue
4607          and in the loop body.  We need one splat per group and one
4608          compare per mask.
4609
4610          Sometimes the prologue mask will fold to a constant,
4611          so the actual prologue cost might be smaller.  However, it's
4612          simpler and safer to use the worst-case cost; if this ends up
4613          being the tie-breaker between vectorizing or not, then it's
4614          probably better not to vectorize.  */
4615       (void) add_stmt_cost (target_cost_data,
4616                             num_masks
4617                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4618                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4619                             vect_prologue);
4620       (void) add_stmt_cost (target_cost_data,
4621                             num_masks
4622                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4623                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4624
4625       /* When we need saturation we need it both in the prologue and
4626          the epilogue.  */
4627       if (need_saturation)
4628         {
4629           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4630                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4631           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4632                                 NULL, NULL, NULL_TREE, 0, vect_body);
4633         }
4634     }
4635   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4636            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4637                == vect_partial_vectors_while_ult))
4638     {
4639       /* Calculate how many masks we need to generate.  */
4640       unsigned int num_masks = 0;
4641       rgroup_controls *rgm;
4642       unsigned int num_vectors_m1;
4643       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4644                         num_vectors_m1, rgm)
4645         if (rgm->type)
4646           num_masks += num_vectors_m1 + 1;
4647       gcc_assert (num_masks > 0);
4648
4649       /* In the worst case, we need to generate each mask in the prologue
4650          and in the loop body.  One of the loop body mask instructions
4651          replaces the comparison in the scalar loop, and since we don't
4652          count the scalar comparison against the scalar body, we shouldn't
4653          count that vector instruction against the vector body either.
4654
4655          Sometimes we can use unpacks instead of generating prologue
4656          masks and sometimes the prologue mask will fold to a constant,
4657          so the actual prologue cost might be smaller.  However, it's
4658          simpler and safer to use the worst-case cost; if this ends up
4659          being the tie-breaker between vectorizing or not, then it's
4660          probably better not to vectorize.  */
4661       (void) add_stmt_cost (target_cost_data, num_masks,
4662                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4663                             vect_prologue);
4664       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4665                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4666                             vect_body);
4667     }
4668   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4669     {
4670       /* Referring to the functions vect_set_loop_condition_partial_vectors
4671          and vect_set_loop_controls_directly, we need to generate each
4672          length in the prologue and in the loop body if required. Although
4673          there are some possible optimizations, we consider the worst case
4674          here.  */
4675
4676       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4677       signed char partial_load_store_bias
4678         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4679       bool need_iterate_p
4680         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4681            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4682
4683       /* Calculate how many statements to be added.  */
4684       unsigned int prologue_stmts = 0;
4685       unsigned int body_stmts = 0;
4686
4687       rgroup_controls *rgc;
4688       unsigned int num_vectors_m1;
4689       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4690         if (rgc->type)
4691           {
4692             /* May need one SHIFT for nitems_total computation.  */
4693             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4694             if (nitems != 1 && !niters_known_p)
4695               prologue_stmts += 1;
4696
4697             /* May need one MAX and one MINUS for wrap around.  */
4698             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4699               prologue_stmts += 2;
4700
4701             /* Need one MAX and one MINUS for each batch limit excepting for
4702                the 1st one.  */
4703             prologue_stmts += num_vectors_m1 * 2;
4704
4705             unsigned int num_vectors = num_vectors_m1 + 1;
4706
4707             /* Need to set up lengths in prologue, only one MIN required
4708                for each since start index is zero.  */
4709             prologue_stmts += num_vectors;
4710
4711             /* If we have a non-zero partial load bias, we need one PLUS
4712                to adjust the load length.  */
4713             if (partial_load_store_bias != 0)
4714               body_stmts += 1;
4715
4716             /* Each may need two MINs and one MINUS to update lengths in body
4717                for next iteration.  */
4718             if (need_iterate_p)
4719               body_stmts += 3 * num_vectors;
4720           }
4721
4722       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4723                             scalar_stmt, vect_prologue);
4724       (void) add_stmt_cost (target_cost_data, body_stmts,
4725                             scalar_stmt, vect_body);
4726     }
4727
4728   /* FORNOW: The scalar outside cost is incremented in one of the
4729      following ways:
4730
4731      1. The vectorizer checks for alignment and aliasing and generates
4732      a condition that allows dynamic vectorization.  A cost model
4733      check is ANDED with the versioning condition.  Hence scalar code
4734      path now has the added cost of the versioning check.
4735
4736        if (cost > th & versioning_check)
4737          jmp to vector code
4738
4739      Hence run-time scalar is incremented by not-taken branch cost.
4740
4741      2. The vectorizer then checks if a prologue is required.  If the
4742      cost model check was not done before during versioning, it has to
4743      be done before the prologue check.
4744
4745        if (cost <= th)
4746          prologue = scalar_iters
4747        if (prologue == 0)
4748          jmp to vector code
4749        else
4750          execute prologue
4751        if (prologue == num_iters)
4752          go to exit
4753
4754      Hence the run-time scalar cost is incremented by a taken branch,
4755      plus a not-taken branch, plus a taken branch cost.
4756
4757      3. The vectorizer then checks if an epilogue is required.  If the
4758      cost model check was not done before during prologue check, it
4759      has to be done with the epilogue check.
4760
4761        if (prologue == 0)
4762          jmp to vector code
4763        else
4764          execute prologue
4765        if (prologue == num_iters)
4766          go to exit
4767        vector code:
4768          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4769            jmp to epilogue
4770
4771      Hence the run-time scalar cost should be incremented by 2 taken
4772      branches.
4773
4774      TODO: The back end may reorder the BBS's differently and reverse
4775      conditions/branch directions.  Change the estimates below to
4776      something more reasonable.  */
4777
4778   /* If the number of iterations is known and we do not do versioning, we can
4779      decide whether to vectorize at compile time.  Hence the scalar version
4780      do not carry cost model guard costs.  */
4781   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4782       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4783     {
4784       /* Cost model check occurs at versioning.  */
4785       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4786         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4787       else
4788         {
4789           /* Cost model check occurs at prologue generation.  */
4790           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4791             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4792               + vect_get_stmt_cost (cond_branch_not_taken);
4793           /* Cost model check occurs at epilogue generation.  */
4794           else
4795             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4796         }
4797     }
4798
4799   /* Complete the target-specific cost calculations.  */
4800   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4801                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4802                suggested_unroll_factor);
4803
4804   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4805       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4806       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4807                     *suggested_unroll_factor,
4808                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4809     {
4810       if (dump_enabled_p ())
4811         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4812                          "can't unroll as unrolled vectorization factor larger"
4813                          " than maximum vectorization factor: "
4814                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4815                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4816       *suggested_unroll_factor = 1;
4817     }
4818
4819   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4820
4821   if (dump_enabled_p ())
4822     {
4823       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4824       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4825                    vec_inside_cost);
4826       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4827                    vec_prologue_cost);
4828       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4829                    vec_epilogue_cost);
4830       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4831                    scalar_single_iter_cost);
4832       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4833                    scalar_outside_cost);
4834       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4835                    vec_outside_cost);
4836       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4837                    peel_iters_prologue);
4838       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4839                    peel_iters_epilogue);
4840     }
4841
4842   /* Calculate number of iterations required to make the vector version
4843      profitable, relative to the loop bodies only.  The following condition
4844      must hold true:
4845      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4846      where
4847      SIC = scalar iteration cost, VIC = vector iteration cost,
4848      VOC = vector outside cost, VF = vectorization factor,
4849      NPEEL = prologue iterations + epilogue iterations,
4850      SOC = scalar outside cost for run time cost model check.  */
4851
4852   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4853                           - vec_inside_cost);
4854   if (saving_per_viter <= 0)
4855     {
4856       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4857         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4858                     "vectorization did not happen for a simd loop");
4859
4860       if (dump_enabled_p ())
4861         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4862                          "cost model: the vector iteration cost = %d "
4863                          "divided by the scalar iteration cost = %d "
4864                          "is greater or equal to the vectorization factor = %d"
4865                          ".\n",
4866                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4867       *ret_min_profitable_niters = -1;
4868       *ret_min_profitable_estimate = -1;
4869       return;
4870     }
4871
4872   /* ??? The "if" arm is written to handle all cases; see below for what
4873      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4874   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4875     {
4876       /* Rewriting the condition above in terms of the number of
4877          vector iterations (vniters) rather than the number of
4878          scalar iterations (niters) gives:
4879
4880          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4881
4882          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4883
4884          For integer N, X and Y when X > 0:
4885
4886          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4887       int outside_overhead = (vec_outside_cost
4888                               - scalar_single_iter_cost * peel_iters_prologue
4889                               - scalar_single_iter_cost * peel_iters_epilogue
4890                               - scalar_outside_cost);
4891       /* We're only interested in cases that require at least one
4892          vector iteration.  */
4893       int min_vec_niters = 1;
4894       if (outside_overhead > 0)
4895         min_vec_niters = outside_overhead / saving_per_viter + 1;
4896
4897       if (dump_enabled_p ())
4898         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4899                      min_vec_niters);
4900
4901       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4902         {
4903           /* Now that we know the minimum number of vector iterations,
4904              find the minimum niters for which the scalar cost is larger:
4905
4906              SIC * niters > VIC * vniters + VOC - SOC
4907
4908              We know that the minimum niters is no more than
4909              vniters * VF + NPEEL, but it might be (and often is) less
4910              than that if a partial vector iteration is cheaper than the
4911              equivalent scalar code.  */
4912           int threshold = (vec_inside_cost * min_vec_niters
4913                            + vec_outside_cost
4914                            - scalar_outside_cost);
4915           if (threshold <= 0)
4916             min_profitable_iters = 1;
4917           else
4918             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4919         }
4920       else
4921         /* Convert the number of vector iterations into a number of
4922            scalar iterations.  */
4923         min_profitable_iters = (min_vec_niters * assumed_vf
4924                                 + peel_iters_prologue
4925                                 + peel_iters_epilogue);
4926     }
4927   else
4928     {
4929       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4930                               * assumed_vf
4931                               - vec_inside_cost * peel_iters_prologue
4932                               - vec_inside_cost * peel_iters_epilogue);
4933       if (min_profitable_iters <= 0)
4934         min_profitable_iters = 0;
4935       else
4936         {
4937           min_profitable_iters /= saving_per_viter;
4938
4939           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4940               <= (((int) vec_inside_cost * min_profitable_iters)
4941                   + (((int) vec_outside_cost - scalar_outside_cost)
4942                      * assumed_vf)))
4943             min_profitable_iters++;
4944         }
4945     }
4946
4947   if (dump_enabled_p ())
4948     dump_printf (MSG_NOTE,
4949                  "  Calculated minimum iters for profitability: %d\n",
4950                  min_profitable_iters);
4951
4952   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4953       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4954     /* We want the vectorized loop to execute at least once.  */
4955     min_profitable_iters = assumed_vf + peel_iters_prologue;
4956   else if (min_profitable_iters < peel_iters_prologue)
4957     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4958        vectorized loop executes at least once.  */
4959     min_profitable_iters = peel_iters_prologue;
4960
4961   if (dump_enabled_p ())
4962     dump_printf_loc (MSG_NOTE, vect_location,
4963                      "  Runtime profitability threshold = %d\n",
4964                      min_profitable_iters);
4965
4966   *ret_min_profitable_niters = min_profitable_iters;
4967
4968   /* Calculate number of iterations required to make the vector version
4969      profitable, relative to the loop bodies only.
4970
4971      Non-vectorized variant is SIC * niters and it must win over vector
4972      variant on the expected loop trip count.  The following condition must hold true:
4973      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4974
4975   if (vec_outside_cost <= 0)
4976     min_profitable_estimate = 0;
4977   /* ??? This "else if" arm is written to handle all cases; see below for
4978      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4979   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4980     {
4981       /* This is a repeat of the code above, but with + SOC rather
4982          than - SOC.  */
4983       int outside_overhead = (vec_outside_cost
4984                               - scalar_single_iter_cost * peel_iters_prologue
4985                               - scalar_single_iter_cost * peel_iters_epilogue
4986                               + scalar_outside_cost);
4987       int min_vec_niters = 1;
4988       if (outside_overhead > 0)
4989         min_vec_niters = outside_overhead / saving_per_viter + 1;
4990
4991       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4992         {
4993           int threshold = (vec_inside_cost * min_vec_niters
4994                            + vec_outside_cost
4995                            + scalar_outside_cost);
4996           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4997         }
4998       else
4999         min_profitable_estimate = (min_vec_niters * assumed_vf
5000                                    + peel_iters_prologue
5001                                    + peel_iters_epilogue);
5002     }
5003   else
5004     {
5005       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5006                                  * assumed_vf
5007                                  - vec_inside_cost * peel_iters_prologue
5008                                  - vec_inside_cost * peel_iters_epilogue)
5009                                  / ((scalar_single_iter_cost * assumed_vf)
5010                                    - vec_inside_cost);
5011     }
5012   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5013   if (dump_enabled_p ())
5014     dump_printf_loc (MSG_NOTE, vect_location,
5015                      "  Static estimate profitability threshold = %d\n",
5016                      min_profitable_estimate);
5017
5018   *ret_min_profitable_estimate = min_profitable_estimate;
5019 }
5020
5021 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5022    vector elements (not bits) for a vector with NELT elements.  */
5023 static void
5024 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5025                               vec_perm_builder *sel)
5026 {
5027   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5028      by vec_perm_indices.  */
5029   sel->new_vector (nelt, 1, 3);
5030   for (unsigned int i = 0; i < 3; i++)
5031     sel->quick_push (i + offset);
5032 }
5033
5034 /* Checks whether the target supports whole-vector shifts for vectors of mode
5035    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5036    it supports vec_perm_const with masks for all necessary shift amounts.  */
5037 static bool
5038 have_whole_vector_shift (machine_mode mode)
5039 {
5040   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5041     return true;
5042
5043   /* Variable-length vectors should be handled via the optab.  */
5044   unsigned int nelt;
5045   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5046     return false;
5047
5048   vec_perm_builder sel;
5049   vec_perm_indices indices;
5050   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5051     {
5052       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5053       indices.new_vector (sel, 2, nelt);
5054       if (!can_vec_perm_const_p (mode, mode, indices, false))
5055         return false;
5056     }
5057   return true;
5058 }
5059
5060 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5061    multiplication operands have differing signs and (b) we intend
5062    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5063    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5064
5065 static bool
5066 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5067                                  stmt_vec_info stmt_info)
5068 {
5069   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5070   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5071     return false;
5072
5073   tree rhs1 = gimple_assign_rhs1 (assign);
5074   tree rhs2 = gimple_assign_rhs2 (assign);
5075   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5076     return false;
5077
5078   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5079   gcc_assert (reduc_info->is_reduc_info);
5080   return !directly_supported_p (DOT_PROD_EXPR,
5081                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5082                                 optab_vector_mixed_sign);
5083 }
5084
5085 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5086    functions. Design better to avoid maintenance issues.  */
5087
5088 /* Function vect_model_reduction_cost.
5089
5090    Models cost for a reduction operation, including the vector ops
5091    generated within the strip-mine loop in some cases, the initial
5092    definition before the loop, and the epilogue code that must be generated.  */
5093
5094 static void
5095 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5096                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5097                            vect_reduction_type reduction_type,
5098                            int ncopies, stmt_vector_for_cost *cost_vec)
5099 {
5100   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5101   tree vectype;
5102   machine_mode mode;
5103   class loop *loop = NULL;
5104
5105   if (loop_vinfo)
5106     loop = LOOP_VINFO_LOOP (loop_vinfo);
5107
5108   /* Condition reductions generate two reductions in the loop.  */
5109   if (reduction_type == COND_REDUCTION)
5110     ncopies *= 2;
5111
5112   vectype = STMT_VINFO_VECTYPE (stmt_info);
5113   mode = TYPE_MODE (vectype);
5114   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5115
5116   gimple_match_op op;
5117   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5118     gcc_unreachable ();
5119
5120   bool emulated_mixed_dot_prod
5121     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5122   if (reduction_type == EXTRACT_LAST_REDUCTION)
5123     /* No extra instructions are needed in the prologue.  The loop body
5124        operations are costed in vectorizable_condition.  */
5125     inside_cost = 0;
5126   else if (reduction_type == FOLD_LEFT_REDUCTION)
5127     {
5128       /* No extra instructions needed in the prologue.  */
5129       prologue_cost = 0;
5130
5131       if (reduc_fn != IFN_LAST)
5132         /* Count one reduction-like operation per vector.  */
5133         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5134                                         stmt_info, 0, vect_body);
5135       else
5136         {
5137           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5138           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5139           inside_cost = record_stmt_cost (cost_vec, nelements,
5140                                           vec_to_scalar, stmt_info, 0,
5141                                           vect_body);
5142           inside_cost += record_stmt_cost (cost_vec, nelements,
5143                                            scalar_stmt, stmt_info, 0,
5144                                            vect_body);
5145         }
5146     }
5147   else
5148     {
5149       /* Add in the cost of the initial definitions.  */
5150       int prologue_stmts;
5151       if (reduction_type == COND_REDUCTION)
5152         /* For cond reductions we have four vectors: initial index, step,
5153            initial result of the data reduction, initial value of the index
5154            reduction.  */
5155         prologue_stmts = 4;
5156       else if (emulated_mixed_dot_prod)
5157         /* We need the initial reduction value and two invariants:
5158            one that contains the minimum signed value and one that
5159            contains half of its negative.  */
5160         prologue_stmts = 3;
5161       else
5162         prologue_stmts = 1;
5163       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5164                                          scalar_to_vec, stmt_info, 0,
5165                                          vect_prologue);
5166     }
5167
5168   /* Determine cost of epilogue code.
5169
5170      We have a reduction operator that will reduce the vector in one statement.
5171      Also requires scalar extract.  */
5172
5173   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5174     {
5175       if (reduc_fn != IFN_LAST)
5176         {
5177           if (reduction_type == COND_REDUCTION)
5178             {
5179               /* An EQ stmt and an COND_EXPR stmt.  */
5180               epilogue_cost += record_stmt_cost (cost_vec, 2,
5181                                                  vector_stmt, stmt_info, 0,
5182                                                  vect_epilogue);
5183               /* Reduction of the max index and a reduction of the found
5184                  values.  */
5185               epilogue_cost += record_stmt_cost (cost_vec, 2,
5186                                                  vec_to_scalar, stmt_info, 0,
5187                                                  vect_epilogue);
5188               /* A broadcast of the max value.  */
5189               epilogue_cost += record_stmt_cost (cost_vec, 1,
5190                                                  scalar_to_vec, stmt_info, 0,
5191                                                  vect_epilogue);
5192             }
5193           else
5194             {
5195               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5196                                                  stmt_info, 0, vect_epilogue);
5197               epilogue_cost += record_stmt_cost (cost_vec, 1,
5198                                                  vec_to_scalar, stmt_info, 0,
5199                                                  vect_epilogue);
5200             }
5201         }
5202       else if (reduction_type == COND_REDUCTION)
5203         {
5204           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5205           /* Extraction of scalar elements.  */
5206           epilogue_cost += record_stmt_cost (cost_vec,
5207                                              2 * estimated_nunits,
5208                                              vec_to_scalar, stmt_info, 0,
5209                                              vect_epilogue);
5210           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5211           epilogue_cost += record_stmt_cost (cost_vec,
5212                                              2 * estimated_nunits - 3,
5213                                              scalar_stmt, stmt_info, 0,
5214                                              vect_epilogue);
5215         }
5216       else if (reduction_type == EXTRACT_LAST_REDUCTION
5217                || reduction_type == FOLD_LEFT_REDUCTION)
5218         /* No extra instructions need in the epilogue.  */
5219         ;
5220       else
5221         {
5222           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5223           tree bitsize = TYPE_SIZE (op.type);
5224           int element_bitsize = tree_to_uhwi (bitsize);
5225           int nelements = vec_size_in_bits / element_bitsize;
5226
5227           if (op.code == COND_EXPR)
5228             op.code = MAX_EXPR;
5229
5230           /* We have a whole vector shift available.  */
5231           if (VECTOR_MODE_P (mode)
5232               && directly_supported_p (op.code, vectype)
5233               && have_whole_vector_shift (mode))
5234             {
5235               /* Final reduction via vector shifts and the reduction operator.
5236                  Also requires scalar extract.  */
5237               epilogue_cost += record_stmt_cost (cost_vec,
5238                                                  exact_log2 (nelements) * 2,
5239                                                  vector_stmt, stmt_info, 0,
5240                                                  vect_epilogue);
5241               epilogue_cost += record_stmt_cost (cost_vec, 1,
5242                                                  vec_to_scalar, stmt_info, 0,
5243                                                  vect_epilogue);
5244             }
5245           else
5246             /* Use extracts and reduction op for final reduction.  For N
5247                elements, we have N extracts and N-1 reduction ops.  */
5248             epilogue_cost += record_stmt_cost (cost_vec,
5249                                                nelements + nelements - 1,
5250                                                vector_stmt, stmt_info, 0,
5251                                                vect_epilogue);
5252         }
5253     }
5254
5255   if (dump_enabled_p ())
5256     dump_printf (MSG_NOTE,
5257                  "vect_model_reduction_cost: inside_cost = %d, "
5258                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5259                  prologue_cost, epilogue_cost);
5260 }
5261
5262 /* SEQ is a sequence of instructions that initialize the reduction
5263    described by REDUC_INFO.  Emit them in the appropriate place.  */
5264
5265 static void
5266 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5267                                 stmt_vec_info reduc_info, gimple *seq)
5268 {
5269   if (reduc_info->reused_accumulator)
5270     {
5271       /* When reusing an accumulator from the main loop, we only need
5272          initialization instructions if the main loop can be skipped.
5273          In that case, emit the initialization instructions at the end
5274          of the guard block that does the skip.  */
5275       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5276       gcc_assert (skip_edge);
5277       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5278       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5279     }
5280   else
5281     {
5282       /* The normal case: emit the initialization instructions on the
5283          preheader edge.  */
5284       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5285       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5286     }
5287 }
5288
5289 /* Function get_initial_def_for_reduction
5290
5291    Input:
5292    REDUC_INFO - the info_for_reduction
5293    INIT_VAL - the initial value of the reduction variable
5294    NEUTRAL_OP - a value that has no effect on the reduction, as per
5295                 neutral_op_for_reduction
5296
5297    Output:
5298    Return a vector variable, initialized according to the operation that
5299         STMT_VINFO performs. This vector will be used as the initial value
5300         of the vector of partial results.
5301
5302    The value we need is a vector in which element 0 has value INIT_VAL
5303    and every other element has value NEUTRAL_OP.  */
5304
5305 static tree
5306 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5307                                stmt_vec_info reduc_info,
5308                                tree init_val, tree neutral_op)
5309 {
5310   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5311   tree scalar_type = TREE_TYPE (init_val);
5312   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5313   tree init_def;
5314   gimple_seq stmts = NULL;
5315
5316   gcc_assert (vectype);
5317
5318   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5319               || SCALAR_FLOAT_TYPE_P (scalar_type));
5320
5321   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5322               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5323
5324   if (operand_equal_p (init_val, neutral_op))
5325     {
5326       /* If both elements are equal then the vector described above is
5327          just a splat.  */
5328       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5329       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5330     }
5331   else
5332     {
5333       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5334       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5335       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5336         {
5337           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5338              element 0.  */
5339           init_def = gimple_build_vector_from_val (&stmts, vectype,
5340                                                    neutral_op);
5341           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5342                                    vectype, init_def, init_val);
5343         }
5344       else
5345         {
5346           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5347           tree_vector_builder elts (vectype, 1, 2);
5348           elts.quick_push (init_val);
5349           elts.quick_push (neutral_op);
5350           init_def = gimple_build_vector (&stmts, &elts);
5351         }
5352     }
5353
5354   if (stmts)
5355     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5356   return init_def;
5357 }
5358
5359 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5360    which performs a reduction involving GROUP_SIZE scalar statements.
5361    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5362    is nonnull, introducing extra elements of that value will not change the
5363    result.  */
5364
5365 static void
5366 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5367                                 stmt_vec_info reduc_info,
5368                                 vec<tree> *vec_oprnds,
5369                                 unsigned int number_of_vectors,
5370                                 unsigned int group_size, tree neutral_op)
5371 {
5372   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5373   unsigned HOST_WIDE_INT nunits;
5374   unsigned j, number_of_places_left_in_vector;
5375   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5376   unsigned int i;
5377
5378   gcc_assert (group_size == initial_values.length () || neutral_op);
5379
5380   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5381      created vectors. It is greater than 1 if unrolling is performed.
5382
5383      For example, we have two scalar operands, s1 and s2 (e.g., group of
5384      strided accesses of size two), while NUNITS is four (i.e., four scalars
5385      of this type can be packed in a vector).  The output vector will contain
5386      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5387      will be 2).
5388
5389      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5390      vectors containing the operands.
5391
5392      For example, NUNITS is four as before, and the group size is 8
5393      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5394      {s5, s6, s7, s8}.  */
5395
5396   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5397     nunits = group_size;
5398
5399   number_of_places_left_in_vector = nunits;
5400   bool constant_p = true;
5401   tree_vector_builder elts (vector_type, nunits, 1);
5402   elts.quick_grow (nunits);
5403   gimple_seq ctor_seq = NULL;
5404   for (j = 0; j < nunits * number_of_vectors; ++j)
5405     {
5406       tree op;
5407       i = j % group_size;
5408
5409       /* Get the def before the loop.  In reduction chain we have only
5410          one initial value.  Else we have as many as PHIs in the group.  */
5411       if (i >= initial_values.length () || (j > i && neutral_op))
5412         op = neutral_op;
5413       else
5414         op = initial_values[i];
5415
5416       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5417       number_of_places_left_in_vector--;
5418       elts[nunits - number_of_places_left_in_vector - 1] = op;
5419       if (!CONSTANT_CLASS_P (op))
5420         constant_p = false;
5421
5422       if (number_of_places_left_in_vector == 0)
5423         {
5424           tree init;
5425           if (constant_p && !neutral_op
5426               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5427               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5428             /* Build the vector directly from ELTS.  */
5429             init = gimple_build_vector (&ctor_seq, &elts);
5430           else if (neutral_op)
5431             {
5432               /* Build a vector of the neutral value and shift the
5433                  other elements into place.  */
5434               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5435                                                    neutral_op);
5436               int k = nunits;
5437               while (k > 0 && elts[k - 1] == neutral_op)
5438                 k -= 1;
5439               while (k > 0)
5440                 {
5441                   k -= 1;
5442                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5443                                        vector_type, init, elts[k]);
5444                 }
5445             }
5446           else
5447             {
5448               /* First time round, duplicate ELTS to fill the
5449                  required number of vectors.  */
5450               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5451                                         elts, number_of_vectors, *vec_oprnds);
5452               break;
5453             }
5454           vec_oprnds->quick_push (init);
5455
5456           number_of_places_left_in_vector = nunits;
5457           elts.new_vector (vector_type, nunits, 1);
5458           elts.quick_grow (nunits);
5459           constant_p = true;
5460         }
5461     }
5462   if (ctor_seq != NULL)
5463     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5464 }
5465
5466 /* For a statement STMT_INFO taking part in a reduction operation return
5467    the stmt_vec_info the meta information is stored on.  */
5468
5469 stmt_vec_info
5470 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5471 {
5472   stmt_info = vect_orig_stmt (stmt_info);
5473   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5474   if (!is_a <gphi *> (stmt_info->stmt)
5475       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5476     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5477   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5478   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5479     {
5480       if (gimple_phi_num_args (phi) == 1)
5481         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5482     }
5483   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5484     {
5485       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5486       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5487         stmt_info = info;
5488     }
5489   return stmt_info;
5490 }
5491
5492 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5493    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5494    return false.  */
5495
5496 static bool
5497 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5498                                 stmt_vec_info reduc_info)
5499 {
5500   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5501   if (!main_loop_vinfo)
5502     return false;
5503
5504   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5505     return false;
5506
5507   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5508   auto_vec<tree, 16> main_loop_results (num_phis);
5509   auto_vec<tree, 16> initial_values (num_phis);
5510   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5511     {
5512       /* The epilogue loop can be entered either from the main loop or
5513          from an earlier guard block.  */
5514       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5515       for (tree incoming_value : reduc_info->reduc_initial_values)
5516         {
5517           /* Look for:
5518
5519                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5520                                     INITIAL_VALUE(guard block)>.  */
5521           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5522
5523           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5524           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5525
5526           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5527           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5528
5529           main_loop_results.quick_push (from_main_loop);
5530           initial_values.quick_push (from_skip);
5531         }
5532     }
5533   else
5534     /* The main loop dominates the epilogue loop.  */
5535     main_loop_results.splice (reduc_info->reduc_initial_values);
5536
5537   /* See if the main loop has the kind of accumulator we need.  */
5538   vect_reusable_accumulator *accumulator
5539     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5540   if (!accumulator
5541       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5542       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5543                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5544     return false;
5545
5546   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5547   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5548   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5549   unsigned HOST_WIDE_INT m;
5550   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5551                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5552     return false;
5553   /* Check the intermediate vector types and operations are available.  */
5554   tree prev_vectype = old_vectype;
5555   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5556   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5557     {
5558       intermediate_nunits = exact_div (intermediate_nunits, 2);
5559       tree intermediate_vectype = get_related_vectype_for_scalar_type
5560         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5561       if (!intermediate_vectype
5562           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5563                                     intermediate_vectype)
5564           || !can_vec_extract (TYPE_MODE (prev_vectype),
5565                                TYPE_MODE (intermediate_vectype)))
5566         return false;
5567       prev_vectype = intermediate_vectype;
5568     }
5569
5570   /* Non-SLP reductions might apply an adjustment after the reduction
5571      operation, in order to simplify the initialization of the accumulator.
5572      If the epilogue loop carries on from where the main loop left off,
5573      it should apply the same adjustment to the final reduction result.
5574
5575      If the epilogue loop can also be entered directly (rather than via
5576      the main loop), we need to be able to handle that case in the same way,
5577      with the same adjustment.  (In principle we could add a PHI node
5578      to select the correct adjustment, but in practice that shouldn't be
5579      necessary.)  */
5580   tree main_adjustment
5581     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5582   if (loop_vinfo->main_loop_edge && main_adjustment)
5583     {
5584       gcc_assert (num_phis == 1);
5585       tree initial_value = initial_values[0];
5586       /* Check that we can use INITIAL_VALUE as the adjustment and
5587          initialize the accumulator with a neutral value instead.  */
5588       if (!operand_equal_p (initial_value, main_adjustment))
5589         return false;
5590       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5591       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5592                                                     code, initial_value);
5593     }
5594   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5595   reduc_info->reduc_initial_values.truncate (0);
5596   reduc_info->reduc_initial_values.splice (initial_values);
5597   reduc_info->reused_accumulator = accumulator;
5598   return true;
5599 }
5600
5601 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5602    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5603
5604 static tree
5605 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5606                             gimple_seq *seq)
5607 {
5608   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5609   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5610   tree stype = TREE_TYPE (vectype);
5611   tree new_temp = vec_def;
5612   while (nunits > nunits1)
5613     {
5614       nunits /= 2;
5615       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5616                                                            stype, nunits);
5617       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5618
5619       /* The target has to make sure we support lowpart/highpart
5620          extraction, either via direct vector extract or through
5621          an integer mode punning.  */
5622       tree dst1, dst2;
5623       gimple *epilog_stmt;
5624       if (convert_optab_handler (vec_extract_optab,
5625                                  TYPE_MODE (TREE_TYPE (new_temp)),
5626                                  TYPE_MODE (vectype1))
5627           != CODE_FOR_nothing)
5628         {
5629           /* Extract sub-vectors directly once vec_extract becomes
5630              a conversion optab.  */
5631           dst1 = make_ssa_name (vectype1);
5632           epilog_stmt
5633               = gimple_build_assign (dst1, BIT_FIELD_REF,
5634                                      build3 (BIT_FIELD_REF, vectype1,
5635                                              new_temp, TYPE_SIZE (vectype1),
5636                                              bitsize_int (0)));
5637           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5638           dst2 =  make_ssa_name (vectype1);
5639           epilog_stmt
5640               = gimple_build_assign (dst2, BIT_FIELD_REF,
5641                                      build3 (BIT_FIELD_REF, vectype1,
5642                                              new_temp, TYPE_SIZE (vectype1),
5643                                              bitsize_int (bitsize)));
5644           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5645         }
5646       else
5647         {
5648           /* Extract via punning to appropriately sized integer mode
5649              vector.  */
5650           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5651           tree etype = build_vector_type (eltype, 2);
5652           gcc_assert (convert_optab_handler (vec_extract_optab,
5653                                              TYPE_MODE (etype),
5654                                              TYPE_MODE (eltype))
5655                       != CODE_FOR_nothing);
5656           tree tem = make_ssa_name (etype);
5657           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5658                                              build1 (VIEW_CONVERT_EXPR,
5659                                                      etype, new_temp));
5660           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5661           new_temp = tem;
5662           tem = make_ssa_name (eltype);
5663           epilog_stmt
5664               = gimple_build_assign (tem, BIT_FIELD_REF,
5665                                      build3 (BIT_FIELD_REF, eltype,
5666                                              new_temp, TYPE_SIZE (eltype),
5667                                              bitsize_int (0)));
5668           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5669           dst1 = make_ssa_name (vectype1);
5670           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5671                                              build1 (VIEW_CONVERT_EXPR,
5672                                                      vectype1, tem));
5673           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5674           tem = make_ssa_name (eltype);
5675           epilog_stmt
5676               = gimple_build_assign (tem, BIT_FIELD_REF,
5677                                      build3 (BIT_FIELD_REF, eltype,
5678                                              new_temp, TYPE_SIZE (eltype),
5679                                              bitsize_int (bitsize)));
5680           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5681           dst2 =  make_ssa_name (vectype1);
5682           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5683                                              build1 (VIEW_CONVERT_EXPR,
5684                                                      vectype1, tem));
5685           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5686         }
5687
5688       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5689     }
5690
5691   return new_temp;
5692 }
5693
5694 /* Function vect_create_epilog_for_reduction
5695
5696    Create code at the loop-epilog to finalize the result of a reduction
5697    computation.
5698
5699    STMT_INFO is the scalar reduction stmt that is being vectorized.
5700    SLP_NODE is an SLP node containing a group of reduction statements. The
5701      first one in this group is STMT_INFO.
5702    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5703    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5704      (counting from 0)
5705
5706    This function:
5707    1. Completes the reduction def-use cycles.
5708    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5709       by calling the function specified by REDUC_FN if available, or by
5710       other means (whole-vector shifts or a scalar loop).
5711       The function also creates a new phi node at the loop exit to preserve
5712       loop-closed form, as illustrated below.
5713
5714      The flow at the entry to this function:
5715
5716         loop:
5717           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5718           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5719           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5720         loop_exit:
5721           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5722           use <s_out0>
5723           use <s_out0>
5724
5725      The above is transformed by this function into:
5726
5727         loop:
5728           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5729           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5730           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5731         loop_exit:
5732           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5733           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5734           v_out2 = reduce <v_out1>
5735           s_out3 = extract_field <v_out2, 0>
5736           s_out4 = adjust_result <s_out3>
5737           use <s_out4>
5738           use <s_out4>
5739 */
5740
5741 static void
5742 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5743                                   stmt_vec_info stmt_info,
5744                                   slp_tree slp_node,
5745                                   slp_instance slp_node_instance)
5746 {
5747   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5748   gcc_assert (reduc_info->is_reduc_info);
5749   /* For double reductions we need to get at the inner loop reduction
5750      stmt which has the meta info attached.  Our stmt_info is that of the
5751      loop-closed PHI of the inner loop which we remember as
5752      def for the reduction PHI generation.  */
5753   bool double_reduc = false;
5754   stmt_vec_info rdef_info = stmt_info;
5755   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5756     {
5757       gcc_assert (!slp_node);
5758       double_reduc = true;
5759       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5760                                             (stmt_info->stmt, 0));
5761       stmt_info = vect_stmt_to_vectorize (stmt_info);
5762     }
5763   gphi *reduc_def_stmt
5764     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5765   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5766   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5767   tree vectype;
5768   machine_mode mode;
5769   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5770   basic_block exit_bb;
5771   tree scalar_dest;
5772   tree scalar_type;
5773   gimple *new_phi = NULL, *phi;
5774   gimple_stmt_iterator exit_gsi;
5775   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5776   gimple *epilog_stmt = NULL;
5777   gimple *exit_phi;
5778   tree bitsize;
5779   tree def;
5780   tree orig_name, scalar_result;
5781   imm_use_iterator imm_iter, phi_imm_iter;
5782   use_operand_p use_p, phi_use_p;
5783   gimple *use_stmt;
5784   auto_vec<tree> reduc_inputs;
5785   int j, i;
5786   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5787   unsigned int group_size = 1, k;
5788   auto_vec<gimple *> phis;
5789   /* SLP reduction without reduction chain, e.g.,
5790      # a1 = phi <a2, a0>
5791      # b1 = phi <b2, b0>
5792      a2 = operation (a1)
5793      b2 = operation (b1)  */
5794   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5795   bool direct_slp_reduc;
5796   tree induction_index = NULL_TREE;
5797
5798   if (slp_node)
5799     group_size = SLP_TREE_LANES (slp_node);
5800
5801   if (nested_in_vect_loop_p (loop, stmt_info))
5802     {
5803       outer_loop = loop;
5804       loop = loop->inner;
5805       gcc_assert (!slp_node && double_reduc);
5806     }
5807
5808   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5809   gcc_assert (vectype);
5810   mode = TYPE_MODE (vectype);
5811
5812   tree induc_val = NULL_TREE;
5813   tree adjustment_def = NULL;
5814   if (slp_node)
5815     ;
5816   else
5817     {
5818       /* Optimize: for induction condition reduction, if we can't use zero
5819          for induc_val, use initial_def.  */
5820       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5821         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5822       else if (double_reduc)
5823         ;
5824       else
5825         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5826     }
5827
5828   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5829   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5830   if (slp_reduc)
5831     /* All statements produce live-out values.  */
5832     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5833   else if (slp_node)
5834     {
5835       /* The last statement in the reduction chain produces the live-out
5836          value.  Note SLP optimization can shuffle scalar stmts to
5837          optimize permutations so we have to search for the last stmt.  */
5838       for (k = 0; k < group_size; ++k)
5839         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5840           {
5841             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5842             break;
5843           }
5844     }
5845
5846   unsigned vec_num;
5847   int ncopies;
5848   if (slp_node)
5849     {
5850       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5851       ncopies = 1;
5852     }
5853   else
5854     {
5855       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5856       vec_num = 1;
5857       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5858     }
5859
5860   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5861      which is updated with the current index of the loop for every match of
5862      the original loop's cond_expr (VEC_STMT).  This results in a vector
5863      containing the last time the condition passed for that vector lane.
5864      The first match will be a 1 to allow 0 to be used for non-matching
5865      indexes.  If there are no matches at all then the vector will be all
5866      zeroes.
5867
5868      PR92772: This algorithm is broken for architectures that support
5869      masked vectors, but do not provide fold_extract_last.  */
5870   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5871     {
5872       auto_vec<std::pair<tree, bool>, 2> ccompares;
5873       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5874       cond_info = vect_stmt_to_vectorize (cond_info);
5875       while (cond_info != reduc_info)
5876         {
5877           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5878             {
5879               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5880               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5881               ccompares.safe_push
5882                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5883                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5884             }
5885           cond_info
5886             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5887                                                  1 + STMT_VINFO_REDUC_IDX
5888                                                         (cond_info)));
5889           cond_info = vect_stmt_to_vectorize (cond_info);
5890         }
5891       gcc_assert (ccompares.length () != 0);
5892
5893       tree indx_before_incr, indx_after_incr;
5894       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5895       int scalar_precision
5896         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5897       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5898       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5899         (TYPE_MODE (vectype), cr_index_scalar_type,
5900          TYPE_VECTOR_SUBPARTS (vectype));
5901
5902       /* First we create a simple vector induction variable which starts
5903          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5904          vector size (STEP).  */
5905
5906       /* Create a {1,2,3,...} vector.  */
5907       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5908
5909       /* Create a vector of the step value.  */
5910       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5911       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5912
5913       /* Create an induction variable.  */
5914       gimple_stmt_iterator incr_gsi;
5915       bool insert_after;
5916       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5917       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5918                  insert_after, &indx_before_incr, &indx_after_incr);
5919
5920       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5921          filled with zeros (VEC_ZERO).  */
5922
5923       /* Create a vector of 0s.  */
5924       tree zero = build_zero_cst (cr_index_scalar_type);
5925       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5926
5927       /* Create a vector phi node.  */
5928       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5929       new_phi = create_phi_node (new_phi_tree, loop->header);
5930       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5931                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5932
5933       /* Now take the condition from the loops original cond_exprs
5934          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5935          every match uses values from the induction variable
5936          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5937          (NEW_PHI_TREE).
5938          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5939          the new cond_expr (INDEX_COND_EXPR).  */
5940       gimple_seq stmts = NULL;
5941       for (int i = ccompares.length () - 1; i != -1; --i)
5942         {
5943           tree ccompare = ccompares[i].first;
5944           if (ccompares[i].second)
5945             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5946                                          cr_index_vector_type,
5947                                          ccompare,
5948                                          indx_before_incr, new_phi_tree);
5949           else
5950             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5951                                          cr_index_vector_type,
5952                                          ccompare,
5953                                          new_phi_tree, indx_before_incr);
5954         }
5955       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5956
5957       /* Update the phi with the vec cond.  */
5958       induction_index = new_phi_tree;
5959       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5960                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5961     }
5962
5963   /* 2. Create epilog code.
5964         The reduction epilog code operates across the elements of the vector
5965         of partial results computed by the vectorized loop.
5966         The reduction epilog code consists of:
5967
5968         step 1: compute the scalar result in a vector (v_out2)
5969         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5970         step 3: adjust the scalar result (s_out3) if needed.
5971
5972         Step 1 can be accomplished using one the following three schemes:
5973           (scheme 1) using reduc_fn, if available.
5974           (scheme 2) using whole-vector shifts, if available.
5975           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5976                      combined.
5977
5978           The overall epilog code looks like this:
5979
5980           s_out0 = phi <s_loop>         # original EXIT_PHI
5981           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5982           v_out2 = reduce <v_out1>              # step 1
5983           s_out3 = extract_field <v_out2, 0>    # step 2
5984           s_out4 = adjust_result <s_out3>       # step 3
5985
5986           (step 3 is optional, and steps 1 and 2 may be combined).
5987           Lastly, the uses of s_out0 are replaced by s_out4.  */
5988
5989
5990   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5991          v_out1 = phi <VECT_DEF>
5992          Store them in NEW_PHIS.  */
5993   if (double_reduc)
5994     loop = outer_loop;
5995   exit_bb = single_exit (loop)->dest;
5996   exit_gsi = gsi_after_labels (exit_bb);
5997   reduc_inputs.create (slp_node ? vec_num : ncopies);
5998   for (unsigned i = 0; i < vec_num; i++)
5999     {
6000       gimple_seq stmts = NULL;
6001       if (slp_node)
6002         def = vect_get_slp_vect_def (slp_node, i);
6003       else
6004         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6005       for (j = 0; j < ncopies; j++)
6006         {
6007           tree new_def = copy_ssa_name (def);
6008           phi = create_phi_node (new_def, exit_bb);
6009           if (j)
6010             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6011           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
6012           new_def = gimple_convert (&stmts, vectype, new_def);
6013           reduc_inputs.quick_push (new_def);
6014         }
6015       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6016     }
6017
6018   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6019          (i.e. when reduc_fn is not available) and in the final adjustment
6020          code (if needed).  Also get the original scalar reduction variable as
6021          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6022          represents a reduction pattern), the tree-code and scalar-def are
6023          taken from the original stmt that the pattern-stmt (STMT) replaces.
6024          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6025          are taken from STMT.  */
6026
6027   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6028   if (orig_stmt_info != stmt_info)
6029     {
6030       /* Reduction pattern  */
6031       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6032       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6033     }
6034
6035   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6036   scalar_type = TREE_TYPE (scalar_dest);
6037   scalar_results.truncate (0);
6038   scalar_results.reserve_exact (group_size);
6039   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6040   bitsize = TYPE_SIZE (scalar_type);
6041
6042   /* True if we should implement SLP_REDUC using native reduction operations
6043      instead of scalar operations.  */
6044   direct_slp_reduc = (reduc_fn != IFN_LAST
6045                       && slp_reduc
6046                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6047
6048   /* In case of reduction chain, e.g.,
6049      # a1 = phi <a3, a0>
6050      a2 = operation (a1)
6051      a3 = operation (a2),
6052
6053      we may end up with more than one vector result.  Here we reduce them
6054      to one vector.
6055
6056      The same is true for a SLP reduction, e.g.,
6057      # a1 = phi <a2, a0>
6058      # b1 = phi <b2, b0>
6059      a2 = operation (a1)
6060      b2 = operation (a2),
6061
6062      where we can end up with more than one vector as well.  We can
6063      easily accumulate vectors when the number of vector elements is
6064      a multiple of the SLP group size.
6065
6066      The same is true if we couldn't use a single defuse cycle.  */
6067   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6068       || direct_slp_reduc
6069       || (slp_reduc
6070           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6071       || ncopies > 1)
6072     {
6073       gimple_seq stmts = NULL;
6074       tree single_input = reduc_inputs[0];
6075       for (k = 1; k < reduc_inputs.length (); k++)
6076         single_input = gimple_build (&stmts, code, vectype,
6077                                      single_input, reduc_inputs[k]);
6078       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6079
6080       reduc_inputs.truncate (0);
6081       reduc_inputs.safe_push (single_input);
6082     }
6083
6084   tree orig_reduc_input = reduc_inputs[0];
6085
6086   /* If this loop is an epilogue loop that can be skipped after the
6087      main loop, we can only share a reduction operation between the
6088      main loop and the epilogue if we put it at the target of the
6089      skip edge.
6090
6091      We can still reuse accumulators if this check fails.  Doing so has
6092      the minor(?) benefit of making the epilogue loop's scalar result
6093      independent of the main loop's scalar result.  */
6094   bool unify_with_main_loop_p = false;
6095   if (reduc_info->reused_accumulator
6096       && loop_vinfo->skip_this_loop_edge
6097       && single_succ_p (exit_bb)
6098       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6099     {
6100       unify_with_main_loop_p = true;
6101
6102       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6103       reduc_inputs[0] = make_ssa_name (vectype);
6104       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6105       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6106                    UNKNOWN_LOCATION);
6107       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6108                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6109       exit_gsi = gsi_after_labels (reduc_block);
6110     }
6111
6112   /* Shouldn't be used beyond this point.  */
6113   exit_bb = nullptr;
6114
6115   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6116       && reduc_fn != IFN_LAST)
6117     {
6118       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6119          various data values where the condition matched and another vector
6120          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6121          need to extract the last matching index (which will be the index with
6122          highest value) and use this to index into the data vector.
6123          For the case where there were no matches, the data vector will contain
6124          all default values and the index vector will be all zeros.  */
6125
6126       /* Get various versions of the type of the vector of indexes.  */
6127       tree index_vec_type = TREE_TYPE (induction_index);
6128       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6129       tree index_scalar_type = TREE_TYPE (index_vec_type);
6130       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6131
6132       /* Get an unsigned integer version of the type of the data vector.  */
6133       int scalar_precision
6134         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6135       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6136       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6137                                                 vectype);
6138
6139       /* First we need to create a vector (ZERO_VEC) of zeros and another
6140          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6141          can create using a MAX reduction and then expanding.
6142          In the case where the loop never made any matches, the max index will
6143          be zero.  */
6144
6145       /* Vector of {0, 0, 0,...}.  */
6146       tree zero_vec = build_zero_cst (vectype);
6147
6148       /* Find maximum value from the vector of found indexes.  */
6149       tree max_index = make_ssa_name (index_scalar_type);
6150       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6151                                                           1, induction_index);
6152       gimple_call_set_lhs (max_index_stmt, max_index);
6153       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6154
6155       /* Vector of {max_index, max_index, max_index,...}.  */
6156       tree max_index_vec = make_ssa_name (index_vec_type);
6157       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6158                                                       max_index);
6159       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6160                                                         max_index_vec_rhs);
6161       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6162
6163       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6164          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6165          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6166          otherwise.  Only one value should match, resulting in a vector
6167          (VEC_COND) with one data value and the rest zeros.
6168          In the case where the loop never made any matches, every index will
6169          match, resulting in a vector with all data values (which will all be
6170          the default value).  */
6171
6172       /* Compare the max index vector to the vector of found indexes to find
6173          the position of the max value.  */
6174       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6175       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6176                                                       induction_index,
6177                                                       max_index_vec);
6178       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6179
6180       /* Use the compare to choose either values from the data vector or
6181          zero.  */
6182       tree vec_cond = make_ssa_name (vectype);
6183       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6184                                                    vec_compare,
6185                                                    reduc_inputs[0],
6186                                                    zero_vec);
6187       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6188
6189       /* Finally we need to extract the data value from the vector (VEC_COND)
6190          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6191          reduction, but because this doesn't exist, we can use a MAX reduction
6192          instead.  The data value might be signed or a float so we need to cast
6193          it first.
6194          In the case where the loop never made any matches, the data values are
6195          all identical, and so will reduce down correctly.  */
6196
6197       /* Make the matched data values unsigned.  */
6198       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6199       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6200                                        vec_cond);
6201       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6202                                                         VIEW_CONVERT_EXPR,
6203                                                         vec_cond_cast_rhs);
6204       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6205
6206       /* Reduce down to a scalar value.  */
6207       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6208       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6209                                                            1, vec_cond_cast);
6210       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6211       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6212
6213       /* Convert the reduced value back to the result type and set as the
6214          result.  */
6215       gimple_seq stmts = NULL;
6216       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6217                                data_reduc);
6218       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6219       scalar_results.safe_push (new_temp);
6220     }
6221   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6222            && reduc_fn == IFN_LAST)
6223     {
6224       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6225          idx = 0;
6226          idx_val = induction_index[0];
6227          val = data_reduc[0];
6228          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6229            if (induction_index[i] > idx_val)
6230              val = data_reduc[i], idx_val = induction_index[i];
6231          return val;  */
6232
6233       tree data_eltype = TREE_TYPE (vectype);
6234       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6235       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6236       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6237       /* Enforced by vectorizable_reduction, which ensures we have target
6238          support before allowing a conditional reduction on variable-length
6239          vectors.  */
6240       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6241       tree idx_val = NULL_TREE, val = NULL_TREE;
6242       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6243         {
6244           tree old_idx_val = idx_val;
6245           tree old_val = val;
6246           idx_val = make_ssa_name (idx_eltype);
6247           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6248                                              build3 (BIT_FIELD_REF, idx_eltype,
6249                                                      induction_index,
6250                                                      bitsize_int (el_size),
6251                                                      bitsize_int (off)));
6252           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6253           val = make_ssa_name (data_eltype);
6254           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6255                                              build3 (BIT_FIELD_REF,
6256                                                      data_eltype,
6257                                                      reduc_inputs[0],
6258                                                      bitsize_int (el_size),
6259                                                      bitsize_int (off)));
6260           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6261           if (off != 0)
6262             {
6263               tree new_idx_val = idx_val;
6264               if (off != v_size - el_size)
6265                 {
6266                   new_idx_val = make_ssa_name (idx_eltype);
6267                   epilog_stmt = gimple_build_assign (new_idx_val,
6268                                                      MAX_EXPR, idx_val,
6269                                                      old_idx_val);
6270                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6271                 }
6272               tree cond = make_ssa_name (boolean_type_node);
6273               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6274                                                  idx_val, old_idx_val);
6275               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6276               tree new_val = make_ssa_name (data_eltype);
6277               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6278                                                  cond, val, old_val);
6279               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6280               idx_val = new_idx_val;
6281               val = new_val;
6282             }
6283         }
6284       /* Convert the reduced value back to the result type and set as the
6285          result.  */
6286       gimple_seq stmts = NULL;
6287       val = gimple_convert (&stmts, scalar_type, val);
6288       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6289       scalar_results.safe_push (val);
6290     }
6291
6292   /* 2.3 Create the reduction code, using one of the three schemes described
6293          above. In SLP we simply need to extract all the elements from the
6294          vector (without reducing them), so we use scalar shifts.  */
6295   else if (reduc_fn != IFN_LAST && !slp_reduc)
6296     {
6297       tree tmp;
6298       tree vec_elem_type;
6299
6300       /* Case 1:  Create:
6301          v_out2 = reduc_expr <v_out1>  */
6302
6303       if (dump_enabled_p ())
6304         dump_printf_loc (MSG_NOTE, vect_location,
6305                          "Reduce using direct vector reduction.\n");
6306
6307       gimple_seq stmts = NULL;
6308       vec_elem_type = TREE_TYPE (vectype);
6309       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6310                                vec_elem_type, reduc_inputs[0]);
6311       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6312       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6313
6314       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6315           && induc_val)
6316         {
6317           /* Earlier we set the initial value to be a vector if induc_val
6318              values.  Check the result and if it is induc_val then replace
6319              with the original initial value, unless induc_val is
6320              the same as initial_def already.  */
6321           tree zcompare = make_ssa_name (boolean_type_node);
6322           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6323                                              new_temp, induc_val);
6324           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6325           tree initial_def = reduc_info->reduc_initial_values[0];
6326           tmp = make_ssa_name (new_scalar_dest);
6327           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6328                                              initial_def, new_temp);
6329           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6330           new_temp = tmp;
6331         }
6332
6333       scalar_results.safe_push (new_temp);
6334     }
6335   else if (direct_slp_reduc)
6336     {
6337       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6338          with the elements for other SLP statements replaced with the
6339          neutral value.  We can then do a normal reduction on each vector.  */
6340
6341       /* Enforced by vectorizable_reduction.  */
6342       gcc_assert (reduc_inputs.length () == 1);
6343       gcc_assert (pow2p_hwi (group_size));
6344
6345       gimple_seq seq = NULL;
6346
6347       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6348          and the same element size as VECTYPE.  */
6349       tree index = build_index_vector (vectype, 0, 1);
6350       tree index_type = TREE_TYPE (index);
6351       tree index_elt_type = TREE_TYPE (index_type);
6352       tree mask_type = truth_type_for (index_type);
6353
6354       /* Create a vector that, for each element, identifies which of
6355          the REDUC_GROUP_SIZE results should use it.  */
6356       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6357       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6358                             build_vector_from_val (index_type, index_mask));
6359
6360       /* Get a neutral vector value.  This is simply a splat of the neutral
6361          scalar value if we have one, otherwise the initial scalar value
6362          is itself a neutral value.  */
6363       tree vector_identity = NULL_TREE;
6364       tree neutral_op = NULL_TREE;
6365       if (slp_node)
6366         {
6367           tree initial_value = NULL_TREE;
6368           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6369             initial_value = reduc_info->reduc_initial_values[0];
6370           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6371                                                  initial_value);
6372         }
6373       if (neutral_op)
6374         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6375                                                         neutral_op);
6376       for (unsigned int i = 0; i < group_size; ++i)
6377         {
6378           /* If there's no univeral neutral value, we can use the
6379              initial scalar value from the original PHI.  This is used
6380              for MIN and MAX reduction, for example.  */
6381           if (!neutral_op)
6382             {
6383               tree scalar_value = reduc_info->reduc_initial_values[i];
6384               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6385                                              scalar_value);
6386               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6387                                                               scalar_value);
6388             }
6389
6390           /* Calculate the equivalent of:
6391
6392              sel[j] = (index[j] == i);
6393
6394              which selects the elements of REDUC_INPUTS[0] that should
6395              be included in the result.  */
6396           tree compare_val = build_int_cst (index_elt_type, i);
6397           compare_val = build_vector_from_val (index_type, compare_val);
6398           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6399                                    index, compare_val);
6400
6401           /* Calculate the equivalent of:
6402
6403              vec = seq ? reduc_inputs[0] : vector_identity;
6404
6405              VEC is now suitable for a full vector reduction.  */
6406           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6407                                    sel, reduc_inputs[0], vector_identity);
6408
6409           /* Do the reduction and convert it to the appropriate type.  */
6410           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6411                                       TREE_TYPE (vectype), vec);
6412           scalar = gimple_convert (&seq, scalar_type, scalar);
6413           scalar_results.safe_push (scalar);
6414         }
6415       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6416     }
6417   else
6418     {
6419       bool reduce_with_shift;
6420       tree vec_temp;
6421
6422       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6423
6424       /* See if the target wants to do the final (shift) reduction
6425          in a vector mode of smaller size and first reduce upper/lower
6426          halves against each other.  */
6427       enum machine_mode mode1 = mode;
6428       tree stype = TREE_TYPE (vectype);
6429       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6430       unsigned nunits1 = nunits;
6431       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6432           && reduc_inputs.length () == 1)
6433         {
6434           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6435           /* For SLP reductions we have to make sure lanes match up, but
6436              since we're doing individual element final reduction reducing
6437              vector width here is even more important.
6438              ???  We can also separate lanes with permutes, for the common
6439              case of power-of-two group-size odd/even extracts would work.  */
6440           if (slp_reduc && nunits != nunits1)
6441             {
6442               nunits1 = least_common_multiple (nunits1, group_size);
6443               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6444             }
6445         }
6446       if (!slp_reduc
6447           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6448         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6449
6450       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6451                                                            stype, nunits1);
6452       reduce_with_shift = have_whole_vector_shift (mode1);
6453       if (!VECTOR_MODE_P (mode1)
6454           || !directly_supported_p (code, vectype1))
6455         reduce_with_shift = false;
6456
6457       /* First reduce the vector to the desired vector size we should
6458          do shift reduction on by combining upper and lower halves.  */
6459       gimple_seq stmts = NULL;
6460       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6461                                              code, &stmts);
6462       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6463       reduc_inputs[0] = new_temp;
6464
6465       if (reduce_with_shift && !slp_reduc)
6466         {
6467           int element_bitsize = tree_to_uhwi (bitsize);
6468           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6469              for variable-length vectors and also requires direct target support
6470              for loop reductions.  */
6471           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6472           int nelements = vec_size_in_bits / element_bitsize;
6473           vec_perm_builder sel;
6474           vec_perm_indices indices;
6475
6476           int elt_offset;
6477
6478           tree zero_vec = build_zero_cst (vectype1);
6479           /* Case 2: Create:
6480              for (offset = nelements/2; offset >= 1; offset/=2)
6481                 {
6482                   Create:  va' = vec_shift <va, offset>
6483                   Create:  va = vop <va, va'>
6484                 }  */
6485
6486           tree rhs;
6487
6488           if (dump_enabled_p ())
6489             dump_printf_loc (MSG_NOTE, vect_location,
6490                              "Reduce using vector shifts\n");
6491
6492           gimple_seq stmts = NULL;
6493           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6494           for (elt_offset = nelements / 2;
6495                elt_offset >= 1;
6496                elt_offset /= 2)
6497             {
6498               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6499               indices.new_vector (sel, 2, nelements);
6500               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6501               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6502                                        new_temp, zero_vec, mask);
6503               new_temp = gimple_build (&stmts, code,
6504                                        vectype1, new_name, new_temp);
6505             }
6506           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6507
6508           /* 2.4  Extract the final scalar result.  Create:
6509              s_out3 = extract_field <v_out2, bitpos>  */
6510
6511           if (dump_enabled_p ())
6512             dump_printf_loc (MSG_NOTE, vect_location,
6513                              "extract scalar result\n");
6514
6515           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6516                         bitsize, bitsize_zero_node);
6517           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6518           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6519           gimple_assign_set_lhs (epilog_stmt, new_temp);
6520           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6521           scalar_results.safe_push (new_temp);
6522         }
6523       else
6524         {
6525           /* Case 3: Create:
6526              s = extract_field <v_out2, 0>
6527              for (offset = element_size;
6528                   offset < vector_size;
6529                   offset += element_size;)
6530                {
6531                  Create:  s' = extract_field <v_out2, offset>
6532                  Create:  s = op <s, s'>  // For non SLP cases
6533                }  */
6534
6535           if (dump_enabled_p ())
6536             dump_printf_loc (MSG_NOTE, vect_location,
6537                              "Reduce using scalar code.\n");
6538
6539           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6540           int element_bitsize = tree_to_uhwi (bitsize);
6541           tree compute_type = TREE_TYPE (vectype);
6542           gimple_seq stmts = NULL;
6543           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6544             {
6545               int bit_offset;
6546               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6547                                        vec_temp, bitsize, bitsize_zero_node);
6548
6549               /* In SLP we don't need to apply reduction operation, so we just
6550                  collect s' values in SCALAR_RESULTS.  */
6551               if (slp_reduc)
6552                 scalar_results.safe_push (new_temp);
6553
6554               for (bit_offset = element_bitsize;
6555                    bit_offset < vec_size_in_bits;
6556                    bit_offset += element_bitsize)
6557                 {
6558                   tree bitpos = bitsize_int (bit_offset);
6559                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6560                                            compute_type, vec_temp,
6561                                            bitsize, bitpos);
6562                   if (slp_reduc)
6563                     {
6564                       /* In SLP we don't need to apply reduction operation, so
6565                          we just collect s' values in SCALAR_RESULTS.  */
6566                       new_temp = new_name;
6567                       scalar_results.safe_push (new_name);
6568                     }
6569                   else
6570                     new_temp = gimple_build (&stmts, code, compute_type,
6571                                              new_name, new_temp);
6572                 }
6573             }
6574
6575           /* The only case where we need to reduce scalar results in SLP, is
6576              unrolling.  If the size of SCALAR_RESULTS is greater than
6577              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6578              REDUC_GROUP_SIZE.  */
6579           if (slp_reduc)
6580             {
6581               tree res, first_res, new_res;
6582
6583               /* Reduce multiple scalar results in case of SLP unrolling.  */
6584               for (j = group_size; scalar_results.iterate (j, &res);
6585                    j++)
6586                 {
6587                   first_res = scalar_results[j % group_size];
6588                   new_res = gimple_build (&stmts, code, compute_type,
6589                                           first_res, res);
6590                   scalar_results[j % group_size] = new_res;
6591                 }
6592               scalar_results.truncate (group_size);
6593               for (k = 0; k < group_size; k++)
6594                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6595                                                     scalar_results[k]);
6596             }
6597           else
6598             {
6599               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6600               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6601               scalar_results.safe_push (new_temp);
6602             }
6603
6604           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6605         }
6606
6607       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6608           && induc_val)
6609         {
6610           /* Earlier we set the initial value to be a vector if induc_val
6611              values.  Check the result and if it is induc_val then replace
6612              with the original initial value, unless induc_val is
6613              the same as initial_def already.  */
6614           tree zcompare = make_ssa_name (boolean_type_node);
6615           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6616                                              induc_val);
6617           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6618           tree initial_def = reduc_info->reduc_initial_values[0];
6619           tree tmp = make_ssa_name (new_scalar_dest);
6620           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6621                                              initial_def, new_temp);
6622           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6623           scalar_results[0] = tmp;
6624         }
6625     }
6626
6627   /* 2.5 Adjust the final result by the initial value of the reduction
6628          variable. (When such adjustment is not needed, then
6629          'adjustment_def' is zero).  For example, if code is PLUS we create:
6630          new_temp = loop_exit_def + adjustment_def  */
6631
6632   if (adjustment_def)
6633     {
6634       gcc_assert (!slp_reduc);
6635       gimple_seq stmts = NULL;
6636       if (double_reduc)
6637         {
6638           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6639           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6640           new_temp = gimple_build (&stmts, code, vectype,
6641                                    reduc_inputs[0], adjustment_def);
6642         }
6643       else
6644         {
6645           new_temp = scalar_results[0];
6646           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6647           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6648                                            adjustment_def);
6649           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6650           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6651                                    new_temp, adjustment_def);
6652           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6653         }
6654
6655       epilog_stmt = gimple_seq_last_stmt (stmts);
6656       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6657       scalar_results[0] = new_temp;
6658     }
6659
6660   /* Record this operation if it could be reused by the epilogue loop.  */
6661   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6662       && reduc_inputs.length () == 1)
6663     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6664                                            { orig_reduc_input, reduc_info });
6665
6666   if (double_reduc)
6667     loop = outer_loop;
6668
6669   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6670           phis with new adjusted scalar results, i.e., replace use <s_out0>
6671           with use <s_out4>.
6672
6673      Transform:
6674         loop_exit:
6675           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6676           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6677           v_out2 = reduce <v_out1>
6678           s_out3 = extract_field <v_out2, 0>
6679           s_out4 = adjust_result <s_out3>
6680           use <s_out0>
6681           use <s_out0>
6682
6683      into:
6684
6685         loop_exit:
6686           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6687           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6688           v_out2 = reduce <v_out1>
6689           s_out3 = extract_field <v_out2, 0>
6690           s_out4 = adjust_result <s_out3>
6691           use <s_out4>
6692           use <s_out4> */
6693
6694   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6695   for (k = 0; k < live_out_stmts.size (); k++)
6696     {
6697       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6698       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6699
6700       phis.create (3);
6701       /* Find the loop-closed-use at the loop exit of the original scalar
6702          result.  (The reduction result is expected to have two immediate uses,
6703          one at the latch block, and one at the loop exit).  For double
6704          reductions we are looking for exit phis of the outer loop.  */
6705       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6706         {
6707           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6708             {
6709               if (!is_gimple_debug (USE_STMT (use_p)))
6710                 phis.safe_push (USE_STMT (use_p));
6711             }
6712           else
6713             {
6714               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6715                 {
6716                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6717
6718                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6719                     {
6720                       if (!flow_bb_inside_loop_p (loop,
6721                                              gimple_bb (USE_STMT (phi_use_p)))
6722                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6723                         phis.safe_push (USE_STMT (phi_use_p));
6724                     }
6725                 }
6726             }
6727         }
6728
6729       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6730         {
6731           /* Replace the uses:  */
6732           orig_name = PHI_RESULT (exit_phi);
6733
6734           /* Look for a single use at the target of the skip edge.  */
6735           if (unify_with_main_loop_p)
6736             {
6737               use_operand_p use_p;
6738               gimple *user;
6739               if (!single_imm_use (orig_name, &use_p, &user))
6740                 gcc_unreachable ();
6741               orig_name = gimple_get_lhs (user);
6742             }
6743
6744           scalar_result = scalar_results[k];
6745           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6746             {
6747               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6748                 SET_USE (use_p, scalar_result);
6749               update_stmt (use_stmt);
6750             }
6751         }
6752
6753       phis.release ();
6754     }
6755 }
6756
6757 /* Return a vector of type VECTYPE that is equal to the vector select
6758    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6759    before GSI.  */
6760
6761 static tree
6762 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6763                      tree vec, tree identity)
6764 {
6765   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6766   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6767                                           mask, vec, identity);
6768   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6769   return cond;
6770 }
6771
6772 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6773    order, starting with LHS.  Insert the extraction statements before GSI and
6774    associate the new scalar SSA names with variable SCALAR_DEST.
6775    Return the SSA name for the result.  */
6776
6777 static tree
6778 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6779                        tree_code code, tree lhs, tree vector_rhs)
6780 {
6781   tree vectype = TREE_TYPE (vector_rhs);
6782   tree scalar_type = TREE_TYPE (vectype);
6783   tree bitsize = TYPE_SIZE (scalar_type);
6784   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6785   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6786
6787   for (unsigned HOST_WIDE_INT bit_offset = 0;
6788        bit_offset < vec_size_in_bits;
6789        bit_offset += element_bitsize)
6790     {
6791       tree bitpos = bitsize_int (bit_offset);
6792       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6793                          bitsize, bitpos);
6794
6795       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6796       rhs = make_ssa_name (scalar_dest, stmt);
6797       gimple_assign_set_lhs (stmt, rhs);
6798       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6799
6800       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6801       tree new_name = make_ssa_name (scalar_dest, stmt);
6802       gimple_assign_set_lhs (stmt, new_name);
6803       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6804       lhs = new_name;
6805     }
6806   return lhs;
6807 }
6808
6809 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6810    type of the vector input.  */
6811
6812 static internal_fn
6813 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6814 {
6815   internal_fn mask_reduc_fn;
6816   internal_fn mask_len_reduc_fn;
6817
6818   switch (reduc_fn)
6819     {
6820     case IFN_FOLD_LEFT_PLUS:
6821       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6822       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6823       break;
6824
6825     default:
6826       return IFN_LAST;
6827     }
6828
6829   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6830                                       OPTIMIZE_FOR_SPEED))
6831     return mask_reduc_fn;
6832   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6833                                       OPTIMIZE_FOR_SPEED))
6834     return mask_len_reduc_fn;
6835   return IFN_LAST;
6836 }
6837
6838 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6839    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6840    statement.  CODE is the operation performed by STMT_INFO and OPS are
6841    its scalar operands.  REDUC_INDEX is the index of the operand in
6842    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6843    implements in-order reduction, or IFN_LAST if we should open-code it.
6844    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6845    that should be used to control the operation in a fully-masked loop.  */
6846
6847 static bool
6848 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6849                                stmt_vec_info stmt_info,
6850                                gimple_stmt_iterator *gsi,
6851                                gimple **vec_stmt, slp_tree slp_node,
6852                                gimple *reduc_def_stmt,
6853                                tree_code code, internal_fn reduc_fn,
6854                                tree ops[3], tree vectype_in,
6855                                int reduc_index, vec_loop_masks *masks,
6856                                vec_loop_lens *lens)
6857 {
6858   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6859   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6860   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6861
6862   int ncopies;
6863   if (slp_node)
6864     ncopies = 1;
6865   else
6866     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6867
6868   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6869   gcc_assert (ncopies == 1);
6870   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6871
6872   if (slp_node)
6873     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6874                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6875
6876   tree op0 = ops[1 - reduc_index];
6877
6878   int group_size = 1;
6879   stmt_vec_info scalar_dest_def_info;
6880   auto_vec<tree> vec_oprnds0;
6881   if (slp_node)
6882     {
6883       auto_vec<vec<tree> > vec_defs (2);
6884       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6885       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6886       vec_defs[0].release ();
6887       vec_defs[1].release ();
6888       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6889       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6890     }
6891   else
6892     {
6893       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6894                                      op0, &vec_oprnds0);
6895       scalar_dest_def_info = stmt_info;
6896     }
6897
6898   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6899   tree scalar_type = TREE_TYPE (scalar_dest);
6900   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6901
6902   int vec_num = vec_oprnds0.length ();
6903   gcc_assert (vec_num == 1 || slp_node);
6904   tree vec_elem_type = TREE_TYPE (vectype_out);
6905   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6906
6907   tree vector_identity = NULL_TREE;
6908   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6909     {
6910       vector_identity = build_zero_cst (vectype_out);
6911       if (!HONOR_SIGNED_ZEROS (vectype_out))
6912         ;
6913       else
6914         {
6915           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6916           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6917                                         vector_identity);
6918         }
6919     }
6920
6921   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6922   int i;
6923   tree def0;
6924   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6925     {
6926       gimple *new_stmt;
6927       tree mask = NULL_TREE;
6928       tree len = NULL_TREE;
6929       tree bias = NULL_TREE;
6930       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6931         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6932       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6933         {
6934           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6935                                    i, 1);
6936           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6937           bias = build_int_cst (intQI_type_node, biasval);
6938           mask = build_minus_one_cst (truth_type_for (vectype_in));
6939         }
6940
6941       /* Handle MINUS by adding the negative.  */
6942       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6943         {
6944           tree negated = make_ssa_name (vectype_out);
6945           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6946           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6947           def0 = negated;
6948         }
6949
6950       if (mask && mask_reduc_fn == IFN_LAST)
6951         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6952                                     vector_identity);
6953
6954       /* On the first iteration the input is simply the scalar phi
6955          result, and for subsequent iterations it is the output of
6956          the preceding operation.  */
6957       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6958         {
6959           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6960             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6961                                                    def0, mask, len, bias);
6962           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6963             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6964                                                    def0, mask);
6965           else
6966             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6967                                                    def0);
6968           /* For chained SLP reductions the output of the previous reduction
6969              operation serves as the input of the next. For the final statement
6970              the output cannot be a temporary - we reuse the original
6971              scalar destination of the last statement.  */
6972           if (i != vec_num - 1)
6973             {
6974               gimple_set_lhs (new_stmt, scalar_dest_var);
6975               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6976               gimple_set_lhs (new_stmt, reduc_var);
6977             }
6978         }
6979       else
6980         {
6981           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6982                                              reduc_var, def0);
6983           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6984           /* Remove the statement, so that we can use the same code paths
6985              as for statements that we've just created.  */
6986           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6987           gsi_remove (&tmp_gsi, true);
6988         }
6989
6990       if (i == vec_num - 1)
6991         {
6992           gimple_set_lhs (new_stmt, scalar_dest);
6993           vect_finish_replace_stmt (loop_vinfo,
6994                                     scalar_dest_def_info,
6995                                     new_stmt);
6996         }
6997       else
6998         vect_finish_stmt_generation (loop_vinfo,
6999                                      scalar_dest_def_info,
7000                                      new_stmt, gsi);
7001
7002       if (slp_node)
7003         slp_node->push_vec_def (new_stmt);
7004       else
7005         {
7006           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7007           *vec_stmt = new_stmt;
7008         }
7009     }
7010
7011   return true;
7012 }
7013
7014 /* Function is_nonwrapping_integer_induction.
7015
7016    Check if STMT_VINO (which is part of loop LOOP) both increments and
7017    does not cause overflow.  */
7018
7019 static bool
7020 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7021 {
7022   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7023   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7024   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7025   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7026   widest_int ni, max_loop_value, lhs_max;
7027   wi::overflow_type overflow = wi::OVF_NONE;
7028
7029   /* Make sure the loop is integer based.  */
7030   if (TREE_CODE (base) != INTEGER_CST
7031       || TREE_CODE (step) != INTEGER_CST)
7032     return false;
7033
7034   /* Check that the max size of the loop will not wrap.  */
7035
7036   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7037     return true;
7038
7039   if (! max_stmt_executions (loop, &ni))
7040     return false;
7041
7042   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7043                             &overflow);
7044   if (overflow)
7045     return false;
7046
7047   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7048                             TYPE_SIGN (lhs_type), &overflow);
7049   if (overflow)
7050     return false;
7051
7052   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7053           <= TYPE_PRECISION (lhs_type));
7054 }
7055
7056 /* Check if masking can be supported by inserting a conditional expression.
7057    CODE is the code for the operation.  COND_FN is the conditional internal
7058    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
7059 static bool
7060 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7061                          tree vectype_in)
7062 {
7063   if (cond_fn != IFN_LAST
7064       && direct_internal_fn_supported_p (cond_fn, vectype_in,
7065                                          OPTIMIZE_FOR_SPEED))
7066     return false;
7067
7068   if (code.is_tree_code ())
7069     switch (tree_code (code))
7070       {
7071       case DOT_PROD_EXPR:
7072       case SAD_EXPR:
7073         return true;
7074
7075       default:
7076         break;
7077       }
7078   return false;
7079 }
7080
7081 /* Insert a conditional expression to enable masked vectorization.  CODE is the
7082    code for the operation.  VOP is the array of operands.  MASK is the loop
7083    mask.  GSI is a statement iterator used to place the new conditional
7084    expression.  */
7085 static void
7086 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7087                       gimple_stmt_iterator *gsi)
7088 {
7089   switch (tree_code (code))
7090     {
7091     case DOT_PROD_EXPR:
7092       {
7093         tree vectype = TREE_TYPE (vop[1]);
7094         tree zero = build_zero_cst (vectype);
7095         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7096         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7097                                                mask, vop[1], zero);
7098         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7099         vop[1] = masked_op1;
7100         break;
7101       }
7102
7103     case SAD_EXPR:
7104       {
7105         tree vectype = TREE_TYPE (vop[1]);
7106         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7107         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7108                                                mask, vop[1], vop[0]);
7109         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7110         vop[1] = masked_op1;
7111         break;
7112       }
7113
7114     default:
7115       gcc_unreachable ();
7116     }
7117 }
7118
7119 /* Function vectorizable_reduction.
7120
7121    Check if STMT_INFO performs a reduction operation that can be vectorized.
7122    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7123    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7124    Return true if STMT_INFO is vectorizable in this way.
7125
7126    This function also handles reduction idioms (patterns) that have been
7127    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7128    may be of this form:
7129      X = pattern_expr (arg0, arg1, ..., X)
7130    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7131    sequence that had been detected and replaced by the pattern-stmt
7132    (STMT_INFO).
7133
7134    This function also handles reduction of condition expressions, for example:
7135      for (int i = 0; i < N; i++)
7136        if (a[i] < value)
7137          last = a[i];
7138    This is handled by vectorising the loop and creating an additional vector
7139    containing the loop indexes for which "a[i] < value" was true.  In the
7140    function epilogue this is reduced to a single max value and then used to
7141    index into the vector of results.
7142
7143    In some cases of reduction patterns, the type of the reduction variable X is
7144    different than the type of the other arguments of STMT_INFO.
7145    In such cases, the vectype that is used when transforming STMT_INFO into
7146    a vector stmt is different than the vectype that is used to determine the
7147    vectorization factor, because it consists of a different number of elements
7148    than the actual number of elements that are being operated upon in parallel.
7149
7150    For example, consider an accumulation of shorts into an int accumulator.
7151    On some targets it's possible to vectorize this pattern operating on 8
7152    shorts at a time (hence, the vectype for purposes of determining the
7153    vectorization factor should be V8HI); on the other hand, the vectype that
7154    is used to create the vector form is actually V4SI (the type of the result).
7155
7156    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7157    indicates what is the actual level of parallelism (V8HI in the example), so
7158    that the right vectorization factor would be derived.  This vectype
7159    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7160    be used to create the vectorized stmt.  The right vectype for the vectorized
7161    stmt is obtained from the type of the result X:
7162       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7163
7164    This means that, contrary to "regular" reductions (or "regular" stmts in
7165    general), the following equation:
7166       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7167    does *NOT* necessarily hold for reduction patterns.  */
7168
7169 bool
7170 vectorizable_reduction (loop_vec_info loop_vinfo,
7171                         stmt_vec_info stmt_info, slp_tree slp_node,
7172                         slp_instance slp_node_instance,
7173                         stmt_vector_for_cost *cost_vec)
7174 {
7175   tree vectype_in = NULL_TREE;
7176   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7177   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7178   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7179   stmt_vec_info cond_stmt_vinfo = NULL;
7180   int i;
7181   int ncopies;
7182   bool single_defuse_cycle = false;
7183   bool nested_cycle = false;
7184   bool double_reduc = false;
7185   int vec_num;
7186   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7187   tree cond_reduc_val = NULL_TREE;
7188
7189   /* Make sure it was already recognized as a reduction computation.  */
7190   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7191       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7192       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7193     return false;
7194
7195   /* The stmt we store reduction analysis meta on.  */
7196   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7197   reduc_info->is_reduc_info = true;
7198
7199   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7200     {
7201       if (is_a <gphi *> (stmt_info->stmt))
7202         {
7203           if (slp_node)
7204             {
7205               /* We eventually need to set a vector type on invariant
7206                  arguments.  */
7207               unsigned j;
7208               slp_tree child;
7209               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7210                 if (!vect_maybe_update_slp_op_vectype
7211                        (child, SLP_TREE_VECTYPE (slp_node)))
7212                   {
7213                     if (dump_enabled_p ())
7214                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7215                                        "incompatible vector types for "
7216                                        "invariants\n");
7217                     return false;
7218                   }
7219             }
7220           /* Analysis for double-reduction is done on the outer
7221              loop PHI, nested cycles have no further restrictions.  */
7222           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7223         }
7224       else
7225         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7226       return true;
7227     }
7228
7229   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7230   stmt_vec_info phi_info = stmt_info;
7231   if (!is_a <gphi *> (stmt_info->stmt))
7232     {
7233       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7234       return true;
7235     }
7236   if (slp_node)
7237     {
7238       slp_node_instance->reduc_phis = slp_node;
7239       /* ???  We're leaving slp_node to point to the PHIs, we only
7240          need it to get at the number of vector stmts which wasn't
7241          yet initialized for the instance root.  */
7242     }
7243   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7244     {
7245       use_operand_p use_p;
7246       gimple *use_stmt;
7247       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7248                                  &use_p, &use_stmt);
7249       gcc_assert (res);
7250       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7251     }
7252
7253   /* PHIs should not participate in patterns.  */
7254   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7255   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7256
7257   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7258      and compute the reduction chain length.  Discover the real
7259      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7260   tree reduc_def
7261     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7262                              loop_latch_edge
7263                                (gimple_bb (reduc_def_phi)->loop_father));
7264   unsigned reduc_chain_length = 0;
7265   bool only_slp_reduc_chain = true;
7266   stmt_info = NULL;
7267   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7268   while (reduc_def != PHI_RESULT (reduc_def_phi))
7269     {
7270       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7271       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7272       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7273         {
7274           if (dump_enabled_p ())
7275             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7276                              "reduction chain broken by patterns.\n");
7277           return false;
7278         }
7279       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7280         only_slp_reduc_chain = false;
7281       /* For epilogue generation live members of the chain need
7282          to point back to the PHI via their original stmt for
7283          info_for_reduction to work.  For SLP we need to look at
7284          all lanes here - even though we only will vectorize from
7285          the SLP node with live lane zero the other live lanes also
7286          need to be identified as part of a reduction to be able
7287          to skip code generation for them.  */
7288       if (slp_for_stmt_info)
7289         {
7290           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7291             if (STMT_VINFO_LIVE_P (s))
7292               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7293         }
7294       else if (STMT_VINFO_LIVE_P (vdef))
7295         STMT_VINFO_REDUC_DEF (def) = phi_info;
7296       gimple_match_op op;
7297       if (!gimple_extract_op (vdef->stmt, &op))
7298         {
7299           if (dump_enabled_p ())
7300             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7301                              "reduction chain includes unsupported"
7302                              " statement type.\n");
7303           return false;
7304         }
7305       if (CONVERT_EXPR_CODE_P (op.code))
7306         {
7307           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7308             {
7309               if (dump_enabled_p ())
7310                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7311                                  "conversion in the reduction chain.\n");
7312               return false;
7313             }
7314         }
7315       else if (!stmt_info)
7316         /* First non-conversion stmt.  */
7317         stmt_info = vdef;
7318       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7319       reduc_chain_length++;
7320       if (!stmt_info && slp_node)
7321         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7322     }
7323   /* PHIs should not participate in patterns.  */
7324   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7325
7326   if (nested_in_vect_loop_p (loop, stmt_info))
7327     {
7328       loop = loop->inner;
7329       nested_cycle = true;
7330     }
7331
7332   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7333      element.  */
7334   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7335     {
7336       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7337       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7338     }
7339   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7340     gcc_assert (slp_node
7341                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7342
7343   /* 1. Is vectorizable reduction?  */
7344   /* Not supportable if the reduction variable is used in the loop, unless
7345      it's a reduction chain.  */
7346   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7347       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7348     return false;
7349
7350   /* Reductions that are not used even in an enclosing outer-loop,
7351      are expected to be "live" (used out of the loop).  */
7352   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7353       && !STMT_VINFO_LIVE_P (stmt_info))
7354     return false;
7355
7356   /* 2. Has this been recognized as a reduction pattern?
7357
7358      Check if STMT represents a pattern that has been recognized
7359      in earlier analysis stages.  For stmts that represent a pattern,
7360      the STMT_VINFO_RELATED_STMT field records the last stmt in
7361      the original sequence that constitutes the pattern.  */
7362
7363   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7364   if (orig_stmt_info)
7365     {
7366       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7367       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7368     }
7369
7370   /* 3. Check the operands of the operation.  The first operands are defined
7371         inside the loop body. The last operand is the reduction variable,
7372         which is defined by the loop-header-phi.  */
7373
7374   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7375   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7376   gimple_match_op op;
7377   if (!gimple_extract_op (stmt_info->stmt, &op))
7378     gcc_unreachable ();
7379   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7380                             || op.code == WIDEN_SUM_EXPR
7381                             || op.code == SAD_EXPR);
7382
7383   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7384       && !SCALAR_FLOAT_TYPE_P (op.type))
7385     return false;
7386
7387   /* Do not try to vectorize bit-precision reductions.  */
7388   if (!type_has_mode_precision_p (op.type))
7389     return false;
7390
7391   /* For lane-reducing ops we're reducing the number of reduction PHIs
7392      which means the only use of that may be in the lane-reducing operation.  */
7393   if (lane_reduc_code_p
7394       && reduc_chain_length != 1
7395       && !only_slp_reduc_chain)
7396     {
7397       if (dump_enabled_p ())
7398         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399                          "lane-reducing reduction with extra stmts.\n");
7400       return false;
7401     }
7402
7403   /* All uses but the last are expected to be defined in the loop.
7404      The last use is the reduction variable.  In case of nested cycle this
7405      assumption is not true: we use reduc_index to record the index of the
7406      reduction variable.  */
7407   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7408   /* We need to skip an extra operand for COND_EXPRs with embedded
7409      comparison.  */
7410   unsigned opno_adjust = 0;
7411   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7412     opno_adjust = 1;
7413   for (i = 0; i < (int) op.num_ops; i++)
7414     {
7415       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7416       if (i == 0 && op.code == COND_EXPR)
7417         continue;
7418
7419       stmt_vec_info def_stmt_info;
7420       enum vect_def_type dt;
7421       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7422                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7423                                &vectype_op[i], &def_stmt_info))
7424         {
7425           if (dump_enabled_p ())
7426             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7427                              "use not simple.\n");
7428           return false;
7429         }
7430       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7431         continue;
7432
7433       /* There should be only one cycle def in the stmt, the one
7434          leading to reduc_def.  */
7435       if (VECTORIZABLE_CYCLE_DEF (dt))
7436         return false;
7437
7438       if (!vectype_op[i])
7439         vectype_op[i]
7440           = get_vectype_for_scalar_type (loop_vinfo,
7441                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7442
7443       /* To properly compute ncopies we are interested in the widest
7444          non-reduction input type in case we're looking at a widening
7445          accumulation that we later handle in vect_transform_reduction.  */
7446       if (lane_reduc_code_p
7447           && vectype_op[i]
7448           && (!vectype_in
7449               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7450                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7451         vectype_in = vectype_op[i];
7452
7453       if (op.code == COND_EXPR)
7454         {
7455           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7456           if (dt == vect_constant_def)
7457             {
7458               cond_reduc_dt = dt;
7459               cond_reduc_val = op.ops[i];
7460             }
7461           if (dt == vect_induction_def
7462               && def_stmt_info
7463               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7464             {
7465               cond_reduc_dt = dt;
7466               cond_stmt_vinfo = def_stmt_info;
7467             }
7468         }
7469     }
7470   if (!vectype_in)
7471     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7472   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7473
7474   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7475   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7476   /* If we have a condition reduction, see if we can simplify it further.  */
7477   if (v_reduc_type == COND_REDUCTION)
7478     {
7479       if (slp_node)
7480         return false;
7481
7482       /* When the condition uses the reduction value in the condition, fail.  */
7483       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7484         {
7485           if (dump_enabled_p ())
7486             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7487                              "condition depends on previous iteration\n");
7488           return false;
7489         }
7490
7491       if (reduc_chain_length == 1
7492           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7493                                              vectype_in, OPTIMIZE_FOR_SPEED))
7494         {
7495           if (dump_enabled_p ())
7496             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7497                              "optimizing condition reduction with"
7498                              " FOLD_EXTRACT_LAST.\n");
7499           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7500         }
7501       else if (cond_reduc_dt == vect_induction_def)
7502         {
7503           tree base
7504             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7505           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7506
7507           gcc_assert (TREE_CODE (base) == INTEGER_CST
7508                       && TREE_CODE (step) == INTEGER_CST);
7509           cond_reduc_val = NULL_TREE;
7510           enum tree_code cond_reduc_op_code = ERROR_MARK;
7511           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7512           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7513             ;
7514           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7515              above base; punt if base is the minimum value of the type for
7516              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7517           else if (tree_int_cst_sgn (step) == -1)
7518             {
7519               cond_reduc_op_code = MIN_EXPR;
7520               if (tree_int_cst_sgn (base) == -1)
7521                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7522               else if (tree_int_cst_lt (base,
7523                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7524                 cond_reduc_val
7525                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7526             }
7527           else
7528             {
7529               cond_reduc_op_code = MAX_EXPR;
7530               if (tree_int_cst_sgn (base) == 1)
7531                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7532               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7533                                         base))
7534                 cond_reduc_val
7535                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7536             }
7537           if (cond_reduc_val)
7538             {
7539               if (dump_enabled_p ())
7540                 dump_printf_loc (MSG_NOTE, vect_location,
7541                                  "condition expression based on "
7542                                  "integer induction.\n");
7543               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7544               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7545                 = cond_reduc_val;
7546               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7547             }
7548         }
7549       else if (cond_reduc_dt == vect_constant_def)
7550         {
7551           enum vect_def_type cond_initial_dt;
7552           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7553           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7554           if (cond_initial_dt == vect_constant_def
7555               && types_compatible_p (TREE_TYPE (cond_initial_val),
7556                                      TREE_TYPE (cond_reduc_val)))
7557             {
7558               tree e = fold_binary (LE_EXPR, boolean_type_node,
7559                                     cond_initial_val, cond_reduc_val);
7560               if (e && (integer_onep (e) || integer_zerop (e)))
7561                 {
7562                   if (dump_enabled_p ())
7563                     dump_printf_loc (MSG_NOTE, vect_location,
7564                                      "condition expression based on "
7565                                      "compile time constant.\n");
7566                   /* Record reduction code at analysis stage.  */
7567                   STMT_VINFO_REDUC_CODE (reduc_info)
7568                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7569                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7570                 }
7571             }
7572         }
7573     }
7574
7575   if (STMT_VINFO_LIVE_P (phi_info))
7576     return false;
7577
7578   if (slp_node)
7579     ncopies = 1;
7580   else
7581     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7582
7583   gcc_assert (ncopies >= 1);
7584
7585   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7586
7587   if (nested_cycle)
7588     {
7589       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7590                   == vect_double_reduction_def);
7591       double_reduc = true;
7592     }
7593
7594   /* 4.2. Check support for the epilog operation.
7595
7596           If STMT represents a reduction pattern, then the type of the
7597           reduction variable may be different than the type of the rest
7598           of the arguments.  For example, consider the case of accumulation
7599           of shorts into an int accumulator; The original code:
7600                         S1: int_a = (int) short_a;
7601           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7602
7603           was replaced with:
7604                         STMT: int_acc = widen_sum <short_a, int_acc>
7605
7606           This means that:
7607           1. The tree-code that is used to create the vector operation in the
7608              epilog code (that reduces the partial results) is not the
7609              tree-code of STMT, but is rather the tree-code of the original
7610              stmt from the pattern that STMT is replacing.  I.e, in the example
7611              above we want to use 'widen_sum' in the loop, but 'plus' in the
7612              epilog.
7613           2. The type (mode) we use to check available target support
7614              for the vector operation to be created in the *epilog*, is
7615              determined by the type of the reduction variable (in the example
7616              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7617              However the type (mode) we use to check available target support
7618              for the vector operation to be created *inside the loop*, is
7619              determined by the type of the other arguments to STMT (in the
7620              example we'd check this: optab_handler (widen_sum_optab,
7621              vect_short_mode)).
7622
7623           This is contrary to "regular" reductions, in which the types of all
7624           the arguments are the same as the type of the reduction variable.
7625           For "regular" reductions we can therefore use the same vector type
7626           (and also the same tree-code) when generating the epilog code and
7627           when generating the code inside the loop.  */
7628
7629   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7630   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7631
7632   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7633   if (reduction_type == TREE_CODE_REDUCTION)
7634     {
7635       /* Check whether it's ok to change the order of the computation.
7636          Generally, when vectorizing a reduction we change the order of the
7637          computation.  This may change the behavior of the program in some
7638          cases, so we need to check that this is ok.  One exception is when
7639          vectorizing an outer-loop: the inner-loop is executed sequentially,
7640          and therefore vectorizing reductions in the inner-loop during
7641          outer-loop vectorization is safe.  Likewise when we are vectorizing
7642          a series of reductions using SLP and the VF is one the reductions
7643          are performed in scalar order.  */
7644       if (slp_node
7645           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7646           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7647         ;
7648       else if (needs_fold_left_reduction_p (op.type, orig_code))
7649         {
7650           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7651              is not directy used in stmt.  */
7652           if (!only_slp_reduc_chain
7653               && reduc_chain_length != 1)
7654             {
7655               if (dump_enabled_p ())
7656                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7657                                  "in-order reduction chain without SLP.\n");
7658               return false;
7659             }
7660           STMT_VINFO_REDUC_TYPE (reduc_info)
7661             = reduction_type = FOLD_LEFT_REDUCTION;
7662         }
7663       else if (!commutative_binary_op_p (orig_code, op.type)
7664                || !associative_binary_op_p (orig_code, op.type))
7665         {
7666           if (dump_enabled_p ())
7667             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668                             "reduction: not commutative/associative");
7669           return false;
7670         }
7671     }
7672
7673   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7674       && ncopies > 1)
7675     {
7676       if (dump_enabled_p ())
7677         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7678                          "multiple types in double reduction or condition "
7679                          "reduction or fold-left reduction.\n");
7680       return false;
7681     }
7682
7683   internal_fn reduc_fn = IFN_LAST;
7684   if (reduction_type == TREE_CODE_REDUCTION
7685       || reduction_type == FOLD_LEFT_REDUCTION
7686       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7687       || reduction_type == CONST_COND_REDUCTION)
7688     {
7689       if (reduction_type == FOLD_LEFT_REDUCTION
7690           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7691           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7692         {
7693           if (reduc_fn != IFN_LAST
7694               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7695                                                   OPTIMIZE_FOR_SPEED))
7696             {
7697               if (dump_enabled_p ())
7698                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7699                                  "reduc op not supported by target.\n");
7700
7701               reduc_fn = IFN_LAST;
7702             }
7703         }
7704       else
7705         {
7706           if (!nested_cycle || double_reduc)
7707             {
7708               if (dump_enabled_p ())
7709                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7710                                  "no reduc code for scalar code.\n");
7711
7712               return false;
7713             }
7714         }
7715     }
7716   else if (reduction_type == COND_REDUCTION)
7717     {
7718       int scalar_precision
7719         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7720       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7721       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7722                                                 vectype_out);
7723
7724       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7725                                           OPTIMIZE_FOR_SPEED))
7726         reduc_fn = IFN_REDUC_MAX;
7727     }
7728   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7729
7730   if (reduction_type != EXTRACT_LAST_REDUCTION
7731       && (!nested_cycle || double_reduc)
7732       && reduc_fn == IFN_LAST
7733       && !nunits_out.is_constant ())
7734     {
7735       if (dump_enabled_p ())
7736         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7737                          "missing target support for reduction on"
7738                          " variable-length vectors.\n");
7739       return false;
7740     }
7741
7742   /* For SLP reductions, see if there is a neutral value we can use.  */
7743   tree neutral_op = NULL_TREE;
7744   if (slp_node)
7745     {
7746       tree initial_value = NULL_TREE;
7747       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7748         initial_value = vect_phi_initial_value (reduc_def_phi);
7749       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7750                                              orig_code, initial_value);
7751     }
7752
7753   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7754     {
7755       /* We can't support in-order reductions of code such as this:
7756
7757            for (int i = 0; i < n1; ++i)
7758              for (int j = 0; j < n2; ++j)
7759                l += a[j];
7760
7761          since GCC effectively transforms the loop when vectorizing:
7762
7763            for (int i = 0; i < n1 / VF; ++i)
7764              for (int j = 0; j < n2; ++j)
7765                for (int k = 0; k < VF; ++k)
7766                  l += a[j];
7767
7768          which is a reassociation of the original operation.  */
7769       if (dump_enabled_p ())
7770         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7771                          "in-order double reduction not supported.\n");
7772
7773       return false;
7774     }
7775
7776   if (reduction_type == FOLD_LEFT_REDUCTION
7777       && slp_node
7778       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7779     {
7780       /* We cannot use in-order reductions in this case because there is
7781          an implicit reassociation of the operations involved.  */
7782       if (dump_enabled_p ())
7783         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7784                          "in-order unchained SLP reductions not supported.\n");
7785       return false;
7786     }
7787
7788   /* For double reductions, and for SLP reductions with a neutral value,
7789      we construct a variable-length initial vector by loading a vector
7790      full of the neutral value and then shift-and-inserting the start
7791      values into the low-numbered elements.  */
7792   if ((double_reduc || neutral_op)
7793       && !nunits_out.is_constant ()
7794       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7795                                           vectype_out, OPTIMIZE_FOR_SPEED))
7796     {
7797       if (dump_enabled_p ())
7798         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7799                          "reduction on variable-length vectors requires"
7800                          " target support for a vector-shift-and-insert"
7801                          " operation.\n");
7802       return false;
7803     }
7804
7805   /* Check extra constraints for variable-length unchained SLP reductions.  */
7806   if (slp_node
7807       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7808       && !nunits_out.is_constant ())
7809     {
7810       /* We checked above that we could build the initial vector when
7811          there's a neutral element value.  Check here for the case in
7812          which each SLP statement has its own initial value and in which
7813          that value needs to be repeated for every instance of the
7814          statement within the initial vector.  */
7815       unsigned int group_size = SLP_TREE_LANES (slp_node);
7816       if (!neutral_op
7817           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7818                                               TREE_TYPE (vectype_out)))
7819         {
7820           if (dump_enabled_p ())
7821             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7822                              "unsupported form of SLP reduction for"
7823                              " variable-length vectors: cannot build"
7824                              " initial vector.\n");
7825           return false;
7826         }
7827       /* The epilogue code relies on the number of elements being a multiple
7828          of the group size.  The duplicate-and-interleave approach to setting
7829          up the initial vector does too.  */
7830       if (!multiple_p (nunits_out, group_size))
7831         {
7832           if (dump_enabled_p ())
7833             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7834                              "unsupported form of SLP reduction for"
7835                              " variable-length vectors: the vector size"
7836                              " is not a multiple of the number of results.\n");
7837           return false;
7838         }
7839     }
7840
7841   if (reduction_type == COND_REDUCTION)
7842     {
7843       widest_int ni;
7844
7845       if (! max_loop_iterations (loop, &ni))
7846         {
7847           if (dump_enabled_p ())
7848             dump_printf_loc (MSG_NOTE, vect_location,
7849                              "loop count not known, cannot create cond "
7850                              "reduction.\n");
7851           return false;
7852         }
7853       /* Convert backedges to iterations.  */
7854       ni += 1;
7855
7856       /* The additional index will be the same type as the condition.  Check
7857          that the loop can fit into this less one (because we'll use up the
7858          zero slot for when there are no matches).  */
7859       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7860       if (wi::geu_p (ni, wi::to_widest (max_index)))
7861         {
7862           if (dump_enabled_p ())
7863             dump_printf_loc (MSG_NOTE, vect_location,
7864                              "loop size is greater than data size.\n");
7865           return false;
7866         }
7867     }
7868
7869   /* In case the vectorization factor (VF) is bigger than the number
7870      of elements that we can fit in a vectype (nunits), we have to generate
7871      more than one vector stmt - i.e - we need to "unroll" the
7872      vector stmt by a factor VF/nunits.  For more details see documentation
7873      in vectorizable_operation.  */
7874
7875   /* If the reduction is used in an outer loop we need to generate
7876      VF intermediate results, like so (e.g. for ncopies=2):
7877         r0 = phi (init, r0)
7878         r1 = phi (init, r1)
7879         r0 = x0 + r0;
7880         r1 = x1 + r1;
7881     (i.e. we generate VF results in 2 registers).
7882     In this case we have a separate def-use cycle for each copy, and therefore
7883     for each copy we get the vector def for the reduction variable from the
7884     respective phi node created for this copy.
7885
7886     Otherwise (the reduction is unused in the loop nest), we can combine
7887     together intermediate results, like so (e.g. for ncopies=2):
7888         r = phi (init, r)
7889         r = x0 + r;
7890         r = x1 + r;
7891    (i.e. we generate VF/2 results in a single register).
7892    In this case for each copy we get the vector def for the reduction variable
7893    from the vectorized reduction operation generated in the previous iteration.
7894
7895    This only works when we see both the reduction PHI and its only consumer
7896    in vectorizable_reduction and there are no intermediate stmts
7897    participating.  When unrolling we want each unrolled iteration to have its
7898    own reduction accumulator since one of the main goals of unrolling a
7899    reduction is to reduce the aggregate loop-carried latency.  */
7900   if (ncopies > 1
7901       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7902       && reduc_chain_length == 1
7903       && loop_vinfo->suggested_unroll_factor == 1)
7904     single_defuse_cycle = true;
7905
7906   if (single_defuse_cycle || lane_reduc_code_p)
7907     {
7908       gcc_assert (op.code != COND_EXPR);
7909
7910       /* 4. Supportable by target?  */
7911       bool ok = true;
7912
7913       /* 4.1. check support for the operation in the loop
7914
7915          This isn't necessary for the lane reduction codes, since they
7916          can only be produced by pattern matching, and it's up to the
7917          pattern matcher to test for support.  The main reason for
7918          specifically skipping this step is to avoid rechecking whether
7919          mixed-sign dot-products can be implemented using signed
7920          dot-products.  */
7921       machine_mode vec_mode = TYPE_MODE (vectype_in);
7922       if (!lane_reduc_code_p
7923           && !directly_supported_p (op.code, vectype_in, optab_vector))
7924         {
7925           if (dump_enabled_p ())
7926             dump_printf (MSG_NOTE, "op not supported by target.\n");
7927           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7928               || !vect_can_vectorize_without_simd_p (op.code))
7929             ok = false;
7930           else
7931             if (dump_enabled_p ())
7932               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7933         }
7934
7935       if (vect_emulated_vector_p (vectype_in)
7936           && !vect_can_vectorize_without_simd_p (op.code))
7937         {
7938           if (dump_enabled_p ())
7939             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7940           return false;
7941         }
7942
7943       /* lane-reducing operations have to go through vect_transform_reduction.
7944          For the other cases try without the single cycle optimization.  */
7945       if (!ok)
7946         {
7947           if (lane_reduc_code_p)
7948             return false;
7949           else
7950             single_defuse_cycle = false;
7951         }
7952     }
7953   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7954
7955   /* If the reduction stmt is one of the patterns that have lane
7956      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7957   if ((ncopies > 1 && ! single_defuse_cycle)
7958       && lane_reduc_code_p)
7959     {
7960       if (dump_enabled_p ())
7961         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7962                          "multi def-use cycle not possible for lane-reducing "
7963                          "reduction operation\n");
7964       return false;
7965     }
7966
7967   if (slp_node
7968       && !(!single_defuse_cycle
7969            && !lane_reduc_code_p
7970            && reduction_type != FOLD_LEFT_REDUCTION))
7971     for (i = 0; i < (int) op.num_ops; i++)
7972       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7973         {
7974           if (dump_enabled_p ())
7975             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7976                              "incompatible vector types for invariants\n");
7977           return false;
7978         }
7979
7980   if (slp_node)
7981     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7982   else
7983     vec_num = 1;
7984
7985   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7986                              reduction_type, ncopies, cost_vec);
7987   /* Cost the reduction op inside the loop if transformed via
7988      vect_transform_reduction.  Otherwise this is costed by the
7989      separate vectorizable_* routines.  */
7990   if (single_defuse_cycle || lane_reduc_code_p)
7991     {
7992       int factor = 1;
7993       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7994         /* Three dot-products and a subtraction.  */
7995         factor = 4;
7996       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7997                         stmt_info, 0, vect_body);
7998     }
7999
8000   if (dump_enabled_p ()
8001       && reduction_type == FOLD_LEFT_REDUCTION)
8002     dump_printf_loc (MSG_NOTE, vect_location,
8003                      "using an in-order (fold-left) reduction.\n");
8004   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8005   /* All but single defuse-cycle optimized, lane-reducing and fold-left
8006      reductions go through their own vectorizable_* routines.  */
8007   if (!single_defuse_cycle
8008       && !lane_reduc_code_p
8009       && reduction_type != FOLD_LEFT_REDUCTION)
8010     {
8011       stmt_vec_info tem
8012         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8013       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8014         {
8015           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8016           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8017         }
8018       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8019       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8020     }
8021   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8022     {
8023       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8024       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8025       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8026
8027       if (reduction_type != FOLD_LEFT_REDUCTION
8028           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8029           && (cond_fn == IFN_LAST
8030               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8031                                                   OPTIMIZE_FOR_SPEED)))
8032         {
8033           if (dump_enabled_p ())
8034             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8035                              "can't operate on partial vectors because"
8036                              " no conditional operation is available.\n");
8037           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8038         }
8039       else if (reduction_type == FOLD_LEFT_REDUCTION
8040                && reduc_fn == IFN_LAST
8041                && !expand_vec_cond_expr_p (vectype_in,
8042                                            truth_type_for (vectype_in),
8043                                            SSA_NAME))
8044         {
8045           if (dump_enabled_p ())
8046             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8047                              "can't operate on partial vectors because"
8048                              " no conditional operation is available.\n");
8049           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8050         }
8051       else if (reduction_type == FOLD_LEFT_REDUCTION
8052                && reduc_fn == IFN_LAST
8053                && FLOAT_TYPE_P (vectype_in)
8054                && HONOR_SIGNED_ZEROS (vectype_in)
8055                && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8056         {
8057           if (dump_enabled_p ())
8058             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8059                              "can't operate on partial vectors because"
8060                              " signed zeros cannot be preserved.\n");
8061           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8062         }
8063       else
8064         {
8065           internal_fn mask_reduc_fn
8066             = get_masked_reduction_fn (reduc_fn, vectype_in);
8067
8068           if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8069             vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8070                                   vectype_in, 1);
8071           else
8072             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8073                                    vectype_in, NULL);
8074         }
8075     }
8076   return true;
8077 }
8078
8079 /* STMT_INFO is a dot-product reduction whose multiplication operands
8080    have different signs.  Emit a sequence to emulate the operation
8081    using a series of signed DOT_PROD_EXPRs and return the last
8082    statement generated.  VEC_DEST is the result of the vector operation
8083    and VOP lists its inputs.  */
8084
8085 static gassign *
8086 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8087                              gimple_stmt_iterator *gsi, tree vec_dest,
8088                              tree vop[3])
8089 {
8090   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8091   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8092   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8093   gimple *new_stmt;
8094
8095   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
8096   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8097     std::swap (vop[0], vop[1]);
8098
8099   /* Convert all inputs to signed types.  */
8100   for (int i = 0; i < 3; ++i)
8101     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8102       {
8103         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8104         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8105         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8106         vop[i] = tmp;
8107       }
8108
8109   /* In the comments below we assume 8-bit inputs for simplicity,
8110      but the approach works for any full integer type.  */
8111
8112   /* Create a vector of -128.  */
8113   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8114   tree min_narrow = build_vector_from_val (narrow_vectype,
8115                                            min_narrow_elttype);
8116
8117   /* Create a vector of 64.  */
8118   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8119   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8120   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8121
8122   /* Emit: SUB_RES = VOP[0] - 128.  */
8123   tree sub_res = make_ssa_name (narrow_vectype);
8124   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8125   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8126
8127   /* Emit:
8128
8129        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8130        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8131        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8132
8133      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8134      Doing the two 64 * y steps first allows more time to compute x.  */
8135   tree stage1 = make_ssa_name (wide_vectype);
8136   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8137                                   vop[1], half_narrow, vop[2]);
8138   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8139
8140   tree stage2 = make_ssa_name (wide_vectype);
8141   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8142                                   vop[1], half_narrow, stage1);
8143   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8144
8145   tree stage3 = make_ssa_name (wide_vectype);
8146   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8147                                   sub_res, vop[1], stage2);
8148   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8149
8150   /* Convert STAGE3 to the reduction type.  */
8151   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8152 }
8153
8154 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8155    value.  */
8156
8157 bool
8158 vect_transform_reduction (loop_vec_info loop_vinfo,
8159                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8160                           gimple **vec_stmt, slp_tree slp_node)
8161 {
8162   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8163   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8164   int i;
8165   int ncopies;
8166   int vec_num;
8167
8168   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8169   gcc_assert (reduc_info->is_reduc_info);
8170
8171   if (nested_in_vect_loop_p (loop, stmt_info))
8172     {
8173       loop = loop->inner;
8174       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8175     }
8176
8177   gimple_match_op op;
8178   if (!gimple_extract_op (stmt_info->stmt, &op))
8179     gcc_unreachable ();
8180
8181   /* All uses but the last are expected to be defined in the loop.
8182      The last use is the reduction variable.  In case of nested cycle this
8183      assumption is not true: we use reduc_index to record the index of the
8184      reduction variable.  */
8185   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8186   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8187   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8188   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8189
8190   if (slp_node)
8191     {
8192       ncopies = 1;
8193       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8194     }
8195   else
8196     {
8197       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8198       vec_num = 1;
8199     }
8200
8201   code_helper code = canonicalize_code (op.code, op.type);
8202   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8203   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8204   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8205   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8206
8207   /* Transform.  */
8208   tree new_temp = NULL_TREE;
8209   auto_vec<tree> vec_oprnds0;
8210   auto_vec<tree> vec_oprnds1;
8211   auto_vec<tree> vec_oprnds2;
8212   tree def0;
8213
8214   if (dump_enabled_p ())
8215     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8216
8217   /* FORNOW: Multiple types are not supported for condition.  */
8218   if (code == COND_EXPR)
8219     gcc_assert (ncopies == 1);
8220
8221   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8222
8223   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8224   if (reduction_type == FOLD_LEFT_REDUCTION)
8225     {
8226       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8227       gcc_assert (code.is_tree_code ());
8228       return vectorize_fold_left_reduction
8229           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8230            tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8231            lens);
8232     }
8233
8234   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8235   gcc_assert (single_defuse_cycle
8236               || code == DOT_PROD_EXPR
8237               || code == WIDEN_SUM_EXPR
8238               || code == SAD_EXPR);
8239
8240   /* Create the destination vector  */
8241   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8242   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8243
8244   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8245                      single_defuse_cycle && reduc_index == 0
8246                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
8247                      single_defuse_cycle && reduc_index == 1
8248                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
8249                      op.num_ops == 3
8250                      && !(single_defuse_cycle && reduc_index == 2)
8251                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8252   if (single_defuse_cycle)
8253     {
8254       gcc_assert (!slp_node);
8255       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8256                                      op.ops[reduc_index],
8257                                      reduc_index == 0 ? &vec_oprnds0
8258                                      : (reduc_index == 1 ? &vec_oprnds1
8259                                         : &vec_oprnds2));
8260     }
8261
8262   bool emulated_mixed_dot_prod
8263     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8264   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8265     {
8266       gimple *new_stmt;
8267       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8268       if (masked_loop_p && !mask_by_cond_expr)
8269         {
8270           /* No conditional ifns have been defined for dot-product yet.  */
8271           gcc_assert (code != DOT_PROD_EXPR);
8272
8273           /* Make sure that the reduction accumulator is vop[0].  */
8274           if (reduc_index == 1)
8275             {
8276               gcc_assert (commutative_binary_op_p (code, op.type));
8277               std::swap (vop[0], vop[1]);
8278             }
8279           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8280                                           vec_num * ncopies, vectype_in, i);
8281           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8282                                                     vop[0], vop[1], vop[0]);
8283           new_temp = make_ssa_name (vec_dest, call);
8284           gimple_call_set_lhs (call, new_temp);
8285           gimple_call_set_nothrow (call, true);
8286           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8287           new_stmt = call;
8288         }
8289       else
8290         {
8291           if (op.num_ops == 3)
8292             vop[2] = vec_oprnds2[i];
8293
8294           if (masked_loop_p && mask_by_cond_expr)
8295             {
8296               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8297                                               vec_num * ncopies, vectype_in, i);
8298               build_vect_cond_expr (code, vop, mask, gsi);
8299             }
8300
8301           if (emulated_mixed_dot_prod)
8302             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8303                                                     vec_dest, vop);
8304           else if (code.is_internal_fn ())
8305             new_stmt = gimple_build_call_internal (internal_fn (code),
8306                                                    op.num_ops,
8307                                                    vop[0], vop[1], vop[2]);
8308           else
8309             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8310                                             vop[0], vop[1], vop[2]);
8311           new_temp = make_ssa_name (vec_dest, new_stmt);
8312           gimple_set_lhs (new_stmt, new_temp);
8313           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8314         }
8315
8316       if (slp_node)
8317         slp_node->push_vec_def (new_stmt);
8318       else if (single_defuse_cycle
8319                && i < ncopies - 1)
8320         {
8321           if (reduc_index == 0)
8322             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8323           else if (reduc_index == 1)
8324             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8325           else if (reduc_index == 2)
8326             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8327         }
8328       else
8329         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8330     }
8331
8332   if (!slp_node)
8333     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8334
8335   return true;
8336 }
8337
8338 /* Transform phase of a cycle PHI.  */
8339
8340 bool
8341 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8342                           stmt_vec_info stmt_info, gimple **vec_stmt,
8343                           slp_tree slp_node, slp_instance slp_node_instance)
8344 {
8345   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8346   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8347   int i;
8348   int ncopies;
8349   int j;
8350   bool nested_cycle = false;
8351   int vec_num;
8352
8353   if (nested_in_vect_loop_p (loop, stmt_info))
8354     {
8355       loop = loop->inner;
8356       nested_cycle = true;
8357     }
8358
8359   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8360   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8361   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8362   gcc_assert (reduc_info->is_reduc_info);
8363
8364   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8365       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8366     /* Leave the scalar phi in place.  */
8367     return true;
8368
8369   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8370   /* For a nested cycle we do not fill the above.  */
8371   if (!vectype_in)
8372     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8373   gcc_assert (vectype_in);
8374
8375   if (slp_node)
8376     {
8377       /* The size vect_schedule_slp_instance computes is off for us.  */
8378       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8379                                       * SLP_TREE_LANES (slp_node), vectype_in);
8380       ncopies = 1;
8381     }
8382   else
8383     {
8384       vec_num = 1;
8385       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8386     }
8387
8388   /* Check whether we should use a single PHI node and accumulate
8389      vectors to one before the backedge.  */
8390   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8391     ncopies = 1;
8392
8393   /* Create the destination vector  */
8394   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8395   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8396                                                vectype_out);
8397
8398   /* Get the loop-entry arguments.  */
8399   tree vec_initial_def = NULL_TREE;
8400   auto_vec<tree> vec_initial_defs;
8401   if (slp_node)
8402     {
8403       vec_initial_defs.reserve (vec_num);
8404       if (nested_cycle)
8405         {
8406           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8407           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8408                              &vec_initial_defs);
8409         }
8410       else
8411         {
8412           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8413           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8414           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8415
8416           unsigned int num_phis = stmts.length ();
8417           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8418             num_phis = 1;
8419           initial_values.reserve (num_phis);
8420           for (unsigned int i = 0; i < num_phis; ++i)
8421             {
8422               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8423               initial_values.quick_push (vect_phi_initial_value (this_phi));
8424             }
8425           if (vec_num == 1)
8426             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8427           if (!initial_values.is_empty ())
8428             {
8429               tree initial_value
8430                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8431               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8432               tree neutral_op
8433                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8434                                             code, initial_value);
8435               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8436                                               &vec_initial_defs, vec_num,
8437                                               stmts.length (), neutral_op);
8438             }
8439         }
8440     }
8441   else
8442     {
8443       /* Get at the scalar def before the loop, that defines the initial
8444          value of the reduction variable.  */
8445       tree initial_def = vect_phi_initial_value (phi);
8446       reduc_info->reduc_initial_values.safe_push (initial_def);
8447       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8448          and we can't use zero for induc_val, use initial_def.  Similarly
8449          for REDUC_MIN and initial_def larger than the base.  */
8450       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8451         {
8452           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8453           if (TREE_CODE (initial_def) == INTEGER_CST
8454               && !integer_zerop (induc_val)
8455               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8456                    && tree_int_cst_lt (initial_def, induc_val))
8457                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8458                       && tree_int_cst_lt (induc_val, initial_def))))
8459             {
8460               induc_val = initial_def;
8461               /* Communicate we used the initial_def to epilouge
8462                  generation.  */
8463               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8464             }
8465           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8466         }
8467       else if (nested_cycle)
8468         {
8469           /* Do not use an adjustment def as that case is not supported
8470              correctly if ncopies is not one.  */
8471           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8472                                          ncopies, initial_def,
8473                                          &vec_initial_defs);
8474         }
8475       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8476                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8477         /* Fill the initial vector with the initial scalar value.  */
8478         vec_initial_def
8479           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8480                                            initial_def, initial_def);
8481       else
8482         {
8483           if (ncopies == 1)
8484             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8485           if (!reduc_info->reduc_initial_values.is_empty ())
8486             {
8487               initial_def = reduc_info->reduc_initial_values[0];
8488               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8489               tree neutral_op
8490                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8491                                             code, initial_def);
8492               gcc_assert (neutral_op);
8493               /* Try to simplify the vector initialization by applying an
8494                  adjustment after the reduction has been performed.  */
8495               if (!reduc_info->reused_accumulator
8496                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8497                   && !operand_equal_p (neutral_op, initial_def))
8498                 {
8499                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8500                     = initial_def;
8501                   initial_def = neutral_op;
8502                 }
8503               vec_initial_def
8504                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8505                                                  initial_def, neutral_op);
8506             }
8507         }
8508     }
8509
8510   if (vec_initial_def)
8511     {
8512       vec_initial_defs.create (ncopies);
8513       for (i = 0; i < ncopies; ++i)
8514         vec_initial_defs.quick_push (vec_initial_def);
8515     }
8516
8517   if (auto *accumulator = reduc_info->reused_accumulator)
8518     {
8519       tree def = accumulator->reduc_input;
8520       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8521         {
8522           unsigned int nreduc;
8523           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8524                                             (TREE_TYPE (def)),
8525                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8526                                           &nreduc);
8527           gcc_assert (res);
8528           gimple_seq stmts = NULL;
8529           /* Reduce the single vector to a smaller one.  */
8530           if (nreduc != 1)
8531             {
8532               /* Perform the reduction in the appropriate type.  */
8533               tree rvectype = vectype_out;
8534               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8535                                               TREE_TYPE (TREE_TYPE (def))))
8536                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8537                                               TYPE_VECTOR_SUBPARTS
8538                                                 (vectype_out));
8539               def = vect_create_partial_epilog (def, rvectype,
8540                                                 STMT_VINFO_REDUC_CODE
8541                                                   (reduc_info),
8542                                                 &stmts);
8543             }
8544           /* The epilogue loop might use a different vector mode, like
8545              VNx2DI vs. V2DI.  */
8546           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8547             {
8548               tree reduc_type = build_vector_type_for_mode
8549                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8550               def = gimple_convert (&stmts, reduc_type, def);
8551             }
8552           /* Adjust the input so we pick up the partially reduced value
8553              for the skip edge in vect_create_epilog_for_reduction.  */
8554           accumulator->reduc_input = def;
8555           /* And the reduction could be carried out using a different sign.  */
8556           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8557             def = gimple_convert (&stmts, vectype_out, def);
8558           if (loop_vinfo->main_loop_edge)
8559             {
8560               /* While we'd like to insert on the edge this will split
8561                  blocks and disturb bookkeeping, we also will eventually
8562                  need this on the skip edge.  Rely on sinking to
8563                  fixup optimal placement and insert in the pred.  */
8564               gimple_stmt_iterator gsi
8565                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8566               /* Insert before a cond that eventually skips the
8567                  epilogue.  */
8568               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8569                 gsi_prev (&gsi);
8570               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8571             }
8572           else
8573             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8574                                               stmts);
8575         }
8576       if (loop_vinfo->main_loop_edge)
8577         vec_initial_defs[0]
8578           = vect_get_main_loop_result (loop_vinfo, def,
8579                                        vec_initial_defs[0]);
8580       else
8581         vec_initial_defs.safe_push (def);
8582     }
8583
8584   /* Generate the reduction PHIs upfront.  */
8585   for (i = 0; i < vec_num; i++)
8586     {
8587       tree vec_init_def = vec_initial_defs[i];
8588       for (j = 0; j < ncopies; j++)
8589         {
8590           /* Create the reduction-phi that defines the reduction
8591              operand.  */
8592           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8593
8594           /* Set the loop-entry arg of the reduction-phi.  */
8595           if (j != 0 && nested_cycle)
8596             vec_init_def = vec_initial_defs[j];
8597           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8598                        UNKNOWN_LOCATION);
8599
8600           /* The loop-latch arg is set in epilogue processing.  */
8601
8602           if (slp_node)
8603             slp_node->push_vec_def (new_phi);
8604           else
8605             {
8606               if (j == 0)
8607                 *vec_stmt = new_phi;
8608               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8609             }
8610         }
8611     }
8612
8613   return true;
8614 }
8615
8616 /* Vectorizes LC PHIs.  */
8617
8618 bool
8619 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8620                      stmt_vec_info stmt_info, gimple **vec_stmt,
8621                      slp_tree slp_node)
8622 {
8623   if (!loop_vinfo
8624       || !is_a <gphi *> (stmt_info->stmt)
8625       || gimple_phi_num_args (stmt_info->stmt) != 1)
8626     return false;
8627
8628   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8629       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8630     return false;
8631
8632   if (!vec_stmt) /* transformation not required.  */
8633     {
8634       /* Deal with copies from externs or constants that disguise as
8635          loop-closed PHI nodes (PR97886).  */
8636       if (slp_node
8637           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8638                                                 SLP_TREE_VECTYPE (slp_node)))
8639         {
8640           if (dump_enabled_p ())
8641             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8642                              "incompatible vector types for invariants\n");
8643           return false;
8644         }
8645       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8646       return true;
8647     }
8648
8649   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8650   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8651   basic_block bb = gimple_bb (stmt_info->stmt);
8652   edge e = single_pred_edge (bb);
8653   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8654   auto_vec<tree> vec_oprnds;
8655   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8656                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8657                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8658   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8659     {
8660       /* Create the vectorized LC PHI node.  */
8661       gphi *new_phi = create_phi_node (vec_dest, bb);
8662       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8663       if (slp_node)
8664         slp_node->push_vec_def (new_phi);
8665       else
8666         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8667     }
8668   if (!slp_node)
8669     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8670
8671   return true;
8672 }
8673
8674 /* Vectorizes PHIs.  */
8675
8676 bool
8677 vectorizable_phi (vec_info *,
8678                   stmt_vec_info stmt_info, gimple **vec_stmt,
8679                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8680 {
8681   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8682     return false;
8683
8684   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8685     return false;
8686
8687   tree vectype = SLP_TREE_VECTYPE (slp_node);
8688
8689   if (!vec_stmt) /* transformation not required.  */
8690     {
8691       slp_tree child;
8692       unsigned i;
8693       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8694         if (!child)
8695           {
8696             if (dump_enabled_p ())
8697               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8698                                "PHI node with unvectorized backedge def\n");
8699             return false;
8700           }
8701         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8702           {
8703             if (dump_enabled_p ())
8704               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8705                                "incompatible vector types for invariants\n");
8706             return false;
8707           }
8708         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8709                  && !useless_type_conversion_p (vectype,
8710                                                 SLP_TREE_VECTYPE (child)))
8711           {
8712             /* With bools we can have mask and non-mask precision vectors
8713                or different non-mask precisions.  while pattern recog is
8714                supposed to guarantee consistency here bugs in it can cause
8715                mismatches (PR103489 and PR103800 for example).
8716                Deal with them here instead of ICEing later.  */
8717             if (dump_enabled_p ())
8718               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8719                                "incompatible vector type setup from "
8720                                "bool pattern detection\n");
8721             return false;
8722           }
8723
8724       /* For single-argument PHIs assume coalescing which means zero cost
8725          for the scalar and the vector PHIs.  This avoids artificially
8726          favoring the vector path (but may pessimize it in some cases).  */
8727       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8728         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8729                           vector_stmt, stmt_info, vectype, 0, vect_body);
8730       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8731       return true;
8732     }
8733
8734   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8735   basic_block bb = gimple_bb (stmt_info->stmt);
8736   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8737   auto_vec<gphi *> new_phis;
8738   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8739     {
8740       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8741
8742       /* Skip not yet vectorized defs.  */
8743       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8744           && SLP_TREE_VEC_DEFS (child).is_empty ())
8745         continue;
8746
8747       auto_vec<tree> vec_oprnds;
8748       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8749       if (!new_phis.exists ())
8750         {
8751           new_phis.create (vec_oprnds.length ());
8752           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8753             {
8754               /* Create the vectorized LC PHI node.  */
8755               new_phis.quick_push (create_phi_node (vec_dest, bb));
8756               slp_node->push_vec_def (new_phis[j]);
8757             }
8758         }
8759       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8760       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8761         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8762     }
8763   /* We should have at least one already vectorized child.  */
8764   gcc_assert (new_phis.exists ());
8765
8766   return true;
8767 }
8768
8769 /* Vectorizes first order recurrences.  An overview of the transformation
8770    is described below. Suppose we have the following loop.
8771
8772      int t = 0;
8773      for (int i = 0; i < n; ++i)
8774        {
8775          b[i] = a[i] - t;
8776          t = a[i];
8777        }
8778
8779    There is a first-order recurrence on 'a'. For this loop, the scalar IR
8780    looks (simplified) like:
8781
8782     scalar.preheader:
8783       init = 0;
8784
8785     scalar.body:
8786       i = PHI <0(scalar.preheader), i+1(scalar.body)>
8787       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8788       _1 = a[i]
8789       b[i] = _1 - _2
8790       if (i < n) goto scalar.body
8791
8792    In this example, _2 is a recurrence because it's value depends on the
8793    previous iteration.  We vectorize this as (VF = 4)
8794
8795     vector.preheader:
8796       vect_init = vect_cst(..., ..., ..., 0)
8797
8798     vector.body
8799       i = PHI <0(vector.preheader), i+4(vector.body)>
8800       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8801       vect_2 = a[i, i+1, i+2, i+3];
8802       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8803       b[i, i+1, i+2, i+3] = vect_2 - vect_3
8804       if (..) goto vector.body
8805
8806    In this function, vectorizable_recurr, we code generate both the
8807    vector PHI node and the permute since those together compute the
8808    vectorized value of the scalar PHI.  We do not yet have the
8809    backedge value to fill in there nor into the vec_perm.  Those
8810    are filled in maybe_set_vectorized_backedge_value and
8811    vect_schedule_scc.
8812
8813    TODO:  Since the scalar loop does not have a use of the recurrence
8814    outside of the loop the natural way to implement peeling via
8815    vectorizing the live value doesn't work.  For now peeling of loops
8816    with a recurrence is not implemented.  For SLP the supported cases
8817    are restricted to those requiring a single vector recurrence PHI.  */
8818
8819 bool
8820 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8821                      gimple **vec_stmt, slp_tree slp_node,
8822                      stmt_vector_for_cost *cost_vec)
8823 {
8824   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8825     return false;
8826
8827   gphi *phi = as_a<gphi *> (stmt_info->stmt);
8828
8829   /* So far we only support first-order recurrence auto-vectorization.  */
8830   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8831     return false;
8832
8833   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8834   unsigned ncopies;
8835   if (slp_node)
8836     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8837   else
8838     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8839   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8840   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8841   /* We need to be able to make progress with a single vector.  */
8842   if (maybe_gt (dist * 2, nunits))
8843     {
8844       if (dump_enabled_p ())
8845         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8846                          "first order recurrence exceeds half of "
8847                          "a vector\n");
8848       return false;
8849     }
8850
8851   /* First-order recurrence autovectorization needs to handle permutation
8852      with indices = [nunits-1, nunits, nunits+1, ...].  */
8853   vec_perm_builder sel (nunits, 1, 3);
8854   for (int i = 0; i < 3; ++i)
8855     sel.quick_push (nunits - dist + i);
8856   vec_perm_indices indices (sel, 2, nunits);
8857
8858   if (!vec_stmt) /* transformation not required.  */
8859     {
8860       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8861                                  indices))
8862         return false;
8863
8864       if (slp_node)
8865         {
8866           /* We eventually need to set a vector type on invariant
8867              arguments.  */
8868           unsigned j;
8869           slp_tree child;
8870           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8871             if (!vect_maybe_update_slp_op_vectype
8872                   (child, SLP_TREE_VECTYPE (slp_node)))
8873               {
8874                 if (dump_enabled_p ())
8875                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8876                                    "incompatible vector types for "
8877                                    "invariants\n");
8878                 return false;
8879               }
8880         }
8881       /* The recurrence costs the initialization vector and one permute
8882          for each copy.  */
8883       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8884                                                  stmt_info, 0, vect_prologue);
8885       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8886                                                stmt_info, 0, vect_body);
8887       if (dump_enabled_p ())
8888         dump_printf_loc (MSG_NOTE, vect_location,
8889                          "vectorizable_recurr: inside_cost = %d, "
8890                          "prologue_cost = %d .\n", inside_cost,
8891                          prologue_cost);
8892
8893       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8894       return true;
8895     }
8896
8897   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8898   basic_block bb = gimple_bb (phi);
8899   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8900   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8901     {
8902       gimple_seq stmts = NULL;
8903       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8904       gsi_insert_seq_on_edge_immediate (pe, stmts);
8905     }
8906   tree vec_init = build_vector_from_val (vectype, preheader);
8907   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8908
8909   /* Create the vectorized first-order PHI node.  */
8910   tree vec_dest = vect_get_new_vect_var (vectype,
8911                                          vect_simple_var, "vec_recur_");
8912   gphi *new_phi = create_phi_node (vec_dest, bb);
8913   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8914
8915   /* Insert shuffles the first-order recurrence autovectorization.
8916        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
8917   tree perm = vect_gen_perm_mask_checked (vectype, indices);
8918
8919   /* Insert the required permute after the latch definition.  The
8920      second and later operands are tentative and will be updated when we have
8921      vectorized the latch definition.  */
8922   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8923   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8924   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8925   gsi_next (&gsi2);
8926
8927   for (unsigned i = 0; i < ncopies; ++i)
8928     {
8929       vec_dest = make_ssa_name (vectype);
8930       gassign *vperm
8931           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8932                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
8933                                  NULL, perm);
8934       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8935
8936       if (slp_node)
8937         slp_node->push_vec_def (vperm);
8938       else
8939         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8940     }
8941
8942   if (!slp_node)
8943     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8944   return true;
8945 }
8946
8947 /* Return true if VECTYPE represents a vector that requires lowering
8948    by the vector lowering pass.  */
8949
8950 bool
8951 vect_emulated_vector_p (tree vectype)
8952 {
8953   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8954           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8955               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8956 }
8957
8958 /* Return true if we can emulate CODE on an integer mode representation
8959    of a vector.  */
8960
8961 bool
8962 vect_can_vectorize_without_simd_p (tree_code code)
8963 {
8964   switch (code)
8965     {
8966     case PLUS_EXPR:
8967     case MINUS_EXPR:
8968     case NEGATE_EXPR:
8969     case BIT_AND_EXPR:
8970     case BIT_IOR_EXPR:
8971     case BIT_XOR_EXPR:
8972     case BIT_NOT_EXPR:
8973       return true;
8974
8975     default:
8976       return false;
8977     }
8978 }
8979
8980 /* Likewise, but taking a code_helper.  */
8981
8982 bool
8983 vect_can_vectorize_without_simd_p (code_helper code)
8984 {
8985   return (code.is_tree_code ()
8986           && vect_can_vectorize_without_simd_p (tree_code (code)));
8987 }
8988
8989 /* Create vector init for vectorized iv.  */
8990 static tree
8991 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8992                                tree step_expr, poly_uint64 nunits,
8993                                tree vectype,
8994                                enum vect_induction_op_type induction_type)
8995 {
8996   unsigned HOST_WIDE_INT const_nunits;
8997   tree vec_shift, vec_init, new_name;
8998   unsigned i;
8999   tree itype = TREE_TYPE (vectype);
9000
9001   /* iv_loop is the loop to be vectorized. Create:
9002      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
9003   new_name = gimple_convert (stmts, itype, init_expr);
9004   switch (induction_type)
9005     {
9006     case vect_step_op_shr:
9007     case vect_step_op_shl:
9008       /* Build the Initial value from shift_expr.  */
9009       vec_init = gimple_build_vector_from_val (stmts,
9010                                                vectype,
9011                                                new_name);
9012       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9013                                 build_zero_cst (itype), step_expr);
9014       vec_init = gimple_build (stmts,
9015                                (induction_type == vect_step_op_shr
9016                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9017                                vectype, vec_init, vec_shift);
9018       break;
9019
9020     case vect_step_op_neg:
9021       {
9022         vec_init = gimple_build_vector_from_val (stmts,
9023                                                  vectype,
9024                                                  new_name);
9025         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9026                                      vectype, vec_init);
9027         /* The encoding has 2 interleaved stepped patterns.  */
9028         vec_perm_builder sel (nunits, 2, 3);
9029         sel.quick_grow (6);
9030         for (i = 0; i < 3; i++)
9031           {
9032             sel[2 * i] = i;
9033             sel[2 * i + 1] = i + nunits;
9034           }
9035         vec_perm_indices indices (sel, 2, nunits);
9036         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9037            fail when vec_init is const vector. In that situation vec_perm is not
9038            really needed.  */
9039         tree perm_mask_even
9040           = vect_gen_perm_mask_any (vectype, indices);
9041         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9042                                  vectype,
9043                                  vec_init, vec_neg,
9044                                  perm_mask_even);
9045       }
9046       break;
9047
9048     case vect_step_op_mul:
9049       {
9050         /* Use unsigned mult to avoid UD integer overflow.  */
9051         gcc_assert (nunits.is_constant (&const_nunits));
9052         tree utype = unsigned_type_for (itype);
9053         tree uvectype = build_vector_type (utype,
9054                                            TYPE_VECTOR_SUBPARTS (vectype));
9055         new_name = gimple_convert (stmts, utype, new_name);
9056         vec_init = gimple_build_vector_from_val (stmts,
9057                                                  uvectype,
9058                                                  new_name);
9059         tree_vector_builder elts (uvectype, const_nunits, 1);
9060         tree elt_step = build_one_cst (utype);
9061
9062         elts.quick_push (elt_step);
9063         for (i = 1; i < const_nunits; i++)
9064           {
9065             /* Create: new_name_i = new_name + step_expr.  */
9066             elt_step = gimple_build (stmts, MULT_EXPR,
9067                                      utype, elt_step, step_expr);
9068             elts.quick_push (elt_step);
9069           }
9070         /* Create a vector from [new_name_0, new_name_1, ...,
9071            new_name_nunits-1].  */
9072         tree vec_mul = gimple_build_vector (stmts, &elts);
9073         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9074                                  vec_init, vec_mul);
9075         vec_init = gimple_convert (stmts, vectype, vec_init);
9076       }
9077       break;
9078
9079     default:
9080       gcc_unreachable ();
9081     }
9082
9083   return vec_init;
9084 }
9085
9086 /* Peel init_expr by skip_niter for induction_type.  */
9087 tree
9088 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9089                              tree skip_niters, tree step_expr,
9090                              enum vect_induction_op_type induction_type)
9091 {
9092   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9093   tree type = TREE_TYPE (init_expr);
9094   unsigned prec = TYPE_PRECISION (type);
9095   switch (induction_type)
9096     {
9097     case vect_step_op_neg:
9098       if (TREE_INT_CST_LOW (skip_niters) % 2)
9099         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9100       /* else no change.  */
9101       break;
9102
9103     case vect_step_op_shr:
9104     case vect_step_op_shl:
9105       skip_niters = gimple_convert (stmts, type, skip_niters);
9106       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9107       /* When shift mount >= precision, need to avoid UD.
9108          In the original loop, there's no UD, and according to semantic,
9109          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9110       if (!tree_fits_uhwi_p (step_expr)
9111           || tree_to_uhwi (step_expr) >= prec)
9112         {
9113           if (induction_type == vect_step_op_shl
9114               || TYPE_UNSIGNED (type))
9115             init_expr = build_zero_cst (type);
9116           else
9117             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9118                                       init_expr,
9119                                       wide_int_to_tree (type, prec - 1));
9120         }
9121       else
9122         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9123                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9124                                   type, init_expr, step_expr);
9125       break;
9126
9127     case vect_step_op_mul:
9128       {
9129         tree utype = unsigned_type_for (type);
9130         init_expr = gimple_convert (stmts, utype, init_expr);
9131         unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9132         wide_int begin = wi::to_wide (step_expr);
9133         for (unsigned i = 0; i != skipn - 1; i++)
9134           begin = wi::mul (begin, wi::to_wide (step_expr));
9135         tree mult_expr = wide_int_to_tree (utype, begin);
9136         init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9137         init_expr = gimple_convert (stmts, type, init_expr);
9138       }
9139       break;
9140
9141     default:
9142       gcc_unreachable ();
9143     }
9144
9145   return init_expr;
9146 }
9147
9148 /* Create vector step for vectorized iv.  */
9149 static tree
9150 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9151                                poly_uint64 vf,
9152                                enum vect_induction_op_type induction_type)
9153 {
9154   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9155   tree new_name = NULL;
9156   /* Step should be pow (step, vf) for mult induction.  */
9157   if (induction_type == vect_step_op_mul)
9158     {
9159       gcc_assert (vf.is_constant ());
9160       wide_int begin = wi::to_wide (step_expr);
9161
9162       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9163         begin = wi::mul (begin, wi::to_wide (step_expr));
9164
9165       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9166     }
9167   else if (induction_type == vect_step_op_neg)
9168     /* Do nothing.  */
9169     ;
9170   else
9171     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9172                              expr, step_expr);
9173   return new_name;
9174 }
9175
9176 static tree
9177 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9178                                    stmt_vec_info stmt_info,
9179                                    tree new_name, tree vectype,
9180                                    enum vect_induction_op_type induction_type)
9181 {
9182   /* No step is needed for neg induction.  */
9183   if (induction_type == vect_step_op_neg)
9184     return NULL;
9185
9186   tree t = unshare_expr (new_name);
9187   gcc_assert (CONSTANT_CLASS_P (new_name)
9188               || TREE_CODE (new_name) == SSA_NAME);
9189   tree new_vec = build_vector_from_val (vectype, t);
9190   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9191                                     new_vec, vectype, NULL);
9192   return vec_step;
9193 }
9194
9195 /* Update vectorized iv with vect_step, induc_def is init.  */
9196 static tree
9197 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9198                           tree induc_def, tree vec_step,
9199                           enum vect_induction_op_type induction_type)
9200 {
9201   tree vec_def = induc_def;
9202   switch (induction_type)
9203     {
9204     case vect_step_op_mul:
9205       {
9206         /* Use unsigned mult to avoid UD integer overflow.  */
9207         tree uvectype
9208           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9209                                TYPE_VECTOR_SUBPARTS (vectype));
9210         vec_def = gimple_convert (stmts, uvectype, vec_def);
9211         vec_step = gimple_convert (stmts, uvectype, vec_step);
9212         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9213                                 vec_def, vec_step);
9214         vec_def = gimple_convert (stmts, vectype, vec_def);
9215       }
9216       break;
9217
9218     case vect_step_op_shr:
9219       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9220                               vec_def, vec_step);
9221       break;
9222
9223     case vect_step_op_shl:
9224       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9225                               vec_def, vec_step);
9226       break;
9227     case vect_step_op_neg:
9228       vec_def = induc_def;
9229       /* Do nothing.  */
9230       break;
9231     default:
9232       gcc_unreachable ();
9233     }
9234
9235   return vec_def;
9236
9237 }
9238
9239 /* Function vectorizable_induction
9240
9241    Check if STMT_INFO performs an nonlinear induction computation that can be
9242    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9243    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9244    basic block.
9245    Return true if STMT_INFO is vectorizable in this way.  */
9246
9247 static bool
9248 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9249                                   stmt_vec_info stmt_info,
9250                                   gimple **vec_stmt, slp_tree slp_node,
9251                                   stmt_vector_for_cost *cost_vec)
9252 {
9253   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9254   unsigned ncopies;
9255   bool nested_in_vect_loop = false;
9256   class loop *iv_loop;
9257   tree vec_def;
9258   edge pe = loop_preheader_edge (loop);
9259   basic_block new_bb;
9260   tree vec_init, vec_step;
9261   tree new_name;
9262   gimple *new_stmt;
9263   gphi *induction_phi;
9264   tree induc_def, vec_dest;
9265   tree init_expr, step_expr;
9266   tree niters_skip;
9267   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9268   unsigned i;
9269   gimple_stmt_iterator si;
9270
9271   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9272
9273   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9274   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9275   enum vect_induction_op_type induction_type
9276     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9277
9278   gcc_assert (induction_type > vect_step_op_add);
9279
9280   if (slp_node)
9281     ncopies = 1;
9282   else
9283     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9284   gcc_assert (ncopies >= 1);
9285
9286   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9287   if (nested_in_vect_loop_p (loop, stmt_info))
9288     {
9289       if (dump_enabled_p ())
9290         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9291                          "nonlinear induction in nested loop.\n");
9292       return false;
9293     }
9294
9295   iv_loop = loop;
9296   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9297
9298   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9299      update for each iv and a permutation to generate wanted vector iv.  */
9300   if (slp_node)
9301     {
9302       if (dump_enabled_p ())
9303         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9304                          "SLP induction not supported for nonlinear"
9305                          " induction.\n");
9306       return false;
9307     }
9308
9309   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9310     {
9311       if (dump_enabled_p ())
9312         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9313                          "floating point nonlinear induction vectorization"
9314                          " not supported.\n");
9315       return false;
9316     }
9317
9318   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9319   init_expr = vect_phi_initial_value (phi);
9320   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9321               && TREE_CODE (step_expr) == INTEGER_CST);
9322   /* step_expr should be aligned with init_expr,
9323      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9324   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9325
9326   if (TREE_CODE (init_expr) == INTEGER_CST)
9327     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9328   else
9329     gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9330                                        TREE_TYPE (init_expr)));
9331
9332   switch (induction_type)
9333     {
9334     case vect_step_op_neg:
9335       if (TREE_CODE (init_expr) != INTEGER_CST
9336           && TREE_CODE (init_expr) != REAL_CST)
9337         {
9338           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9339           if (!directly_supported_p (NEGATE_EXPR, vectype))
9340             return false;
9341
9342           /* The encoding has 2 interleaved stepped patterns.  */
9343           vec_perm_builder sel (nunits, 2, 3);
9344           machine_mode mode = TYPE_MODE (vectype);
9345           sel.quick_grow (6);
9346           for (i = 0; i < 3; i++)
9347             {
9348               sel[i * 2] = i;
9349               sel[i * 2 + 1] = i + nunits;
9350             }
9351           vec_perm_indices indices (sel, 2, nunits);
9352           if (!can_vec_perm_const_p (mode, mode, indices))
9353             return false;
9354         }
9355       break;
9356
9357     case vect_step_op_mul:
9358       {
9359         /* Check for backend support of MULT_EXPR.  */
9360         if (!directly_supported_p (MULT_EXPR, vectype))
9361           return false;
9362
9363         /* ?? How to construct vector step for variable number vector.
9364            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9365         if (!vf.is_constant ())
9366           return false;
9367       }
9368       break;
9369
9370     case vect_step_op_shr:
9371       /* Check for backend support of RSHIFT_EXPR.  */
9372       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9373         return false;
9374
9375       /* Don't shift more than type precision to avoid UD.  */
9376       if (!tree_fits_uhwi_p (step_expr)
9377           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9378                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9379         return false;
9380       break;
9381
9382     case vect_step_op_shl:
9383       /* Check for backend support of RSHIFT_EXPR.  */
9384       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9385         return false;
9386
9387       /* Don't shift more than type precision to avoid UD.  */
9388       if (!tree_fits_uhwi_p (step_expr)
9389           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9390                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9391         return false;
9392
9393       break;
9394
9395     default:
9396       gcc_unreachable ();
9397     }
9398
9399   if (!vec_stmt) /* transformation not required.  */
9400     {
9401       unsigned inside_cost = 0, prologue_cost = 0;
9402       /* loop cost for vec_loop. Neg induction doesn't have any
9403          inside_cost.  */
9404       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9405                                       stmt_info, 0, vect_body);
9406
9407       /* loop cost for vec_loop. Neg induction doesn't have any
9408          inside_cost.  */
9409       if (induction_type == vect_step_op_neg)
9410         inside_cost = 0;
9411
9412       /* prologue cost for vec_init and vec_step.  */
9413       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9414                                         stmt_info, 0, vect_prologue);
9415
9416       if (dump_enabled_p ())
9417         dump_printf_loc (MSG_NOTE, vect_location,
9418                          "vect_model_induction_cost: inside_cost = %d, "
9419                          "prologue_cost = %d. \n", inside_cost,
9420                          prologue_cost);
9421
9422       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9423       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9424       return true;
9425     }
9426
9427   /* Transform.  */
9428
9429   /* Compute a vector variable, initialized with the first VF values of
9430      the induction variable.  E.g., for an iv with IV_PHI='X' and
9431      evolution S, for a vector of 4 units, we want to compute:
9432      [X, X + S, X + 2*S, X + 3*S].  */
9433
9434   if (dump_enabled_p ())
9435     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9436
9437   pe = loop_preheader_edge (iv_loop);
9438   /* Find the first insertion point in the BB.  */
9439   basic_block bb = gimple_bb (phi);
9440   si = gsi_after_labels (bb);
9441
9442   gimple_seq stmts = NULL;
9443
9444   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9445   /* If we are using the loop mask to "peel" for alignment then we need
9446      to adjust the start value here.  */
9447   if (niters_skip != NULL_TREE)
9448     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9449                                              step_expr, induction_type);
9450
9451   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9452                                             step_expr, nunits, vectype,
9453                                             induction_type);
9454   if (stmts)
9455     {
9456       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9457       gcc_assert (!new_bb);
9458     }
9459
9460   stmts = NULL;
9461   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9462                                             vf, induction_type);
9463   if (stmts)
9464     {
9465       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9466       gcc_assert (!new_bb);
9467     }
9468
9469   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9470                                                 new_name, vectype,
9471                                                 induction_type);
9472   /* Create the following def-use cycle:
9473      loop prolog:
9474      vec_init = ...
9475      vec_step = ...
9476      loop:
9477      vec_iv = PHI <vec_init, vec_loop>
9478      ...
9479      STMT
9480      ...
9481      vec_loop = vec_iv + vec_step;  */
9482
9483   /* Create the induction-phi that defines the induction-operand.  */
9484   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9485   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9486   induc_def = PHI_RESULT (induction_phi);
9487
9488   /* Create the iv update inside the loop.  */
9489   stmts = NULL;
9490   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9491                                       induc_def, vec_step,
9492                                       induction_type);
9493
9494   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9495   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9496
9497   /* Set the arguments of the phi node:  */
9498   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9499   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9500                UNKNOWN_LOCATION);
9501
9502   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9503   *vec_stmt = induction_phi;
9504
9505   /* In case that vectorization factor (VF) is bigger than the number
9506      of elements that we can fit in a vectype (nunits), we have to generate
9507      more than one vector stmt - i.e - we need to "unroll" the
9508      vector stmt by a factor VF/nunits.  For more details see documentation
9509      in vectorizable_operation.  */
9510
9511   if (ncopies > 1)
9512     {
9513       stmts = NULL;
9514       /* FORNOW. This restriction should be relaxed.  */
9515       gcc_assert (!nested_in_vect_loop);
9516
9517       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9518                                                 nunits, induction_type);
9519
9520       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9521                                                     new_name, vectype,
9522                                                     induction_type);
9523       vec_def = induc_def;
9524       for (i = 1; i < ncopies; i++)
9525         {
9526           /* vec_i = vec_prev + vec_step.  */
9527           stmts = NULL;
9528           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9529                                               vec_def, vec_step,
9530                                               induction_type);
9531           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9532           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9533           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9534         }
9535     }
9536
9537   if (dump_enabled_p ())
9538     dump_printf_loc (MSG_NOTE, vect_location,
9539                      "transform induction: created def-use cycle: %G%G",
9540                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9541
9542   return true;
9543 }
9544
9545 /* Function vectorizable_induction
9546
9547    Check if STMT_INFO performs an induction computation that can be vectorized.
9548    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9549    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9550    Return true if STMT_INFO is vectorizable in this way.  */
9551
9552 bool
9553 vectorizable_induction (loop_vec_info loop_vinfo,
9554                         stmt_vec_info stmt_info,
9555                         gimple **vec_stmt, slp_tree slp_node,
9556                         stmt_vector_for_cost *cost_vec)
9557 {
9558   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9559   unsigned ncopies;
9560   bool nested_in_vect_loop = false;
9561   class loop *iv_loop;
9562   tree vec_def;
9563   edge pe = loop_preheader_edge (loop);
9564   basic_block new_bb;
9565   tree new_vec, vec_init, vec_step, t;
9566   tree new_name;
9567   gimple *new_stmt;
9568   gphi *induction_phi;
9569   tree induc_def, vec_dest;
9570   tree init_expr, step_expr;
9571   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9572   unsigned i;
9573   tree expr;
9574   gimple_stmt_iterator si;
9575   enum vect_induction_op_type induction_type
9576     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9577
9578   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9579   if (!phi)
9580     return false;
9581
9582   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9583     return false;
9584
9585   /* Make sure it was recognized as induction computation.  */
9586   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9587     return false;
9588
9589   /* Handle nonlinear induction in a separate place.  */
9590   if (induction_type != vect_step_op_add)
9591     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9592                                              vec_stmt, slp_node, cost_vec);
9593
9594   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9595   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9596
9597   if (slp_node)
9598     ncopies = 1;
9599   else
9600     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9601   gcc_assert (ncopies >= 1);
9602
9603   /* FORNOW. These restrictions should be relaxed.  */
9604   if (nested_in_vect_loop_p (loop, stmt_info))
9605     {
9606       imm_use_iterator imm_iter;
9607       use_operand_p use_p;
9608       gimple *exit_phi;
9609       edge latch_e;
9610       tree loop_arg;
9611
9612       if (ncopies > 1)
9613         {
9614           if (dump_enabled_p ())
9615             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9616                              "multiple types in nested loop.\n");
9617           return false;
9618         }
9619
9620       exit_phi = NULL;
9621       latch_e = loop_latch_edge (loop->inner);
9622       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9623       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9624         {
9625           gimple *use_stmt = USE_STMT (use_p);
9626           if (is_gimple_debug (use_stmt))
9627             continue;
9628
9629           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9630             {
9631               exit_phi = use_stmt;
9632               break;
9633             }
9634         }
9635       if (exit_phi)
9636         {
9637           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9638           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9639                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9640             {
9641               if (dump_enabled_p ())
9642                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9643                                  "inner-loop induction only used outside "
9644                                  "of the outer vectorized loop.\n");
9645               return false;
9646             }
9647         }
9648
9649       nested_in_vect_loop = true;
9650       iv_loop = loop->inner;
9651     }
9652   else
9653     iv_loop = loop;
9654   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9655
9656   if (slp_node && !nunits.is_constant ())
9657     {
9658       /* The current SLP code creates the step value element-by-element.  */
9659       if (dump_enabled_p ())
9660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9661                          "SLP induction not supported for variable-length"
9662                          " vectors.\n");
9663       return false;
9664     }
9665
9666   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9667     {
9668       if (dump_enabled_p ())
9669         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670                          "floating point induction vectorization disabled\n");
9671       return false;
9672     }
9673
9674   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9675   gcc_assert (step_expr != NULL_TREE);
9676   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9677
9678   /* Check for backend support of PLUS/MINUS_EXPR. */
9679   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9680       || !directly_supported_p (MINUS_EXPR, step_vectype))
9681     return false;
9682
9683   if (!vec_stmt) /* transformation not required.  */
9684     {
9685       unsigned inside_cost = 0, prologue_cost = 0;
9686       if (slp_node)
9687         {
9688           /* We eventually need to set a vector type on invariant
9689              arguments.  */
9690           unsigned j;
9691           slp_tree child;
9692           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9693             if (!vect_maybe_update_slp_op_vectype
9694                 (child, SLP_TREE_VECTYPE (slp_node)))
9695               {
9696                 if (dump_enabled_p ())
9697                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9698                                    "incompatible vector types for "
9699                                    "invariants\n");
9700                 return false;
9701               }
9702           /* loop cost for vec_loop.  */
9703           inside_cost
9704             = record_stmt_cost (cost_vec,
9705                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9706                                 vector_stmt, stmt_info, 0, vect_body);
9707           /* prologue cost for vec_init (if not nested) and step.  */
9708           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9709                                             scalar_to_vec,
9710                                             stmt_info, 0, vect_prologue);
9711         }
9712       else /* if (!slp_node) */
9713         {
9714           /* loop cost for vec_loop.  */
9715           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9716                                           stmt_info, 0, vect_body);
9717           /* prologue cost for vec_init and vec_step.  */
9718           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9719                                             stmt_info, 0, vect_prologue);
9720         }
9721       if (dump_enabled_p ())
9722         dump_printf_loc (MSG_NOTE, vect_location,
9723                          "vect_model_induction_cost: inside_cost = %d, "
9724                          "prologue_cost = %d .\n", inside_cost,
9725                          prologue_cost);
9726
9727       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9728       DUMP_VECT_SCOPE ("vectorizable_induction");
9729       return true;
9730     }
9731
9732   /* Transform.  */
9733
9734   /* Compute a vector variable, initialized with the first VF values of
9735      the induction variable.  E.g., for an iv with IV_PHI='X' and
9736      evolution S, for a vector of 4 units, we want to compute:
9737      [X, X + S, X + 2*S, X + 3*S].  */
9738
9739   if (dump_enabled_p ())
9740     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9741
9742   pe = loop_preheader_edge (iv_loop);
9743   /* Find the first insertion point in the BB.  */
9744   basic_block bb = gimple_bb (phi);
9745   si = gsi_after_labels (bb);
9746
9747   /* For SLP induction we have to generate several IVs as for example
9748      with group size 3 we need
9749        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9750        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
9751   if (slp_node)
9752     {
9753       /* Enforced above.  */
9754       unsigned int const_nunits = nunits.to_constant ();
9755
9756       /* The initial values are vectorized, but any lanes > group_size
9757          need adjustment.  */
9758       slp_tree init_node
9759         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9760
9761       /* Gather steps.  Since we do not vectorize inductions as
9762          cycles we have to reconstruct the step from SCEV data.  */
9763       unsigned group_size = SLP_TREE_LANES (slp_node);
9764       tree *steps = XALLOCAVEC (tree, group_size);
9765       tree *inits = XALLOCAVEC (tree, group_size);
9766       stmt_vec_info phi_info;
9767       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9768         {
9769           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9770           if (!init_node)
9771             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9772                                            pe->dest_idx);
9773         }
9774
9775       /* Now generate the IVs.  */
9776       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9777       gcc_assert ((const_nunits * nvects) % group_size == 0);
9778       unsigned nivs;
9779       if (nested_in_vect_loop)
9780         nivs = nvects;
9781       else
9782         {
9783           /* Compute the number of distinct IVs we need.  First reduce
9784              group_size if it is a multiple of const_nunits so we get
9785              one IV for a group_size of 4 but const_nunits 2.  */
9786           unsigned group_sizep = group_size;
9787           if (group_sizep % const_nunits == 0)
9788             group_sizep = group_sizep / const_nunits;
9789           nivs = least_common_multiple (group_sizep,
9790                                         const_nunits) / const_nunits;
9791         }
9792       tree stept = TREE_TYPE (step_vectype);
9793       tree lupdate_mul = NULL_TREE;
9794       if (!nested_in_vect_loop)
9795         {
9796           /* The number of iterations covered in one vector iteration.  */
9797           unsigned lup_mul = (nvects * const_nunits) / group_size;
9798           lupdate_mul
9799             = build_vector_from_val (step_vectype,
9800                                      SCALAR_FLOAT_TYPE_P (stept)
9801                                      ? build_real_from_wide (stept, lup_mul,
9802                                                              UNSIGNED)
9803                                      : build_int_cstu (stept, lup_mul));
9804         }
9805       tree peel_mul = NULL_TREE;
9806       gimple_seq init_stmts = NULL;
9807       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9808         {
9809           if (SCALAR_FLOAT_TYPE_P (stept))
9810             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9811                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9812           else
9813             peel_mul = gimple_convert (&init_stmts, stept,
9814                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9815           peel_mul = gimple_build_vector_from_val (&init_stmts,
9816                                                    step_vectype, peel_mul);
9817         }
9818       unsigned ivn;
9819       auto_vec<tree> vec_steps;
9820       for (ivn = 0; ivn < nivs; ++ivn)
9821         {
9822           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9823           tree_vector_builder init_elts (vectype, const_nunits, 1);
9824           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9825           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9826             {
9827               /* The scalar steps of the IVs.  */
9828               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9829               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9830               step_elts.quick_push (elt);
9831               if (!init_node)
9832                 {
9833                   /* The scalar inits of the IVs if not vectorized.  */
9834                   elt = inits[(ivn*const_nunits + eltn) % group_size];
9835                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
9836                                                   TREE_TYPE (elt)))
9837                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9838                                         TREE_TYPE (vectype), elt);
9839                   init_elts.quick_push (elt);
9840                 }
9841               /* The number of steps to add to the initial values.  */
9842               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9843               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9844                                    ? build_real_from_wide (stept,
9845                                                            mul_elt, UNSIGNED)
9846                                    : build_int_cstu (stept, mul_elt));
9847             }
9848           vec_step = gimple_build_vector (&init_stmts, &step_elts);
9849           vec_steps.safe_push (vec_step);
9850           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9851           if (peel_mul)
9852             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9853                                      step_mul, peel_mul);
9854           if (!init_node)
9855             vec_init = gimple_build_vector (&init_stmts, &init_elts);
9856
9857           /* Create the induction-phi that defines the induction-operand.  */
9858           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9859                                             "vec_iv_");
9860           induction_phi = create_phi_node (vec_dest, iv_loop->header);
9861           induc_def = PHI_RESULT (induction_phi);
9862
9863           /* Create the iv update inside the loop  */
9864           tree up = vec_step;
9865           if (lupdate_mul)
9866             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9867                                vec_step, lupdate_mul);
9868           gimple_seq stmts = NULL;
9869           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9870           vec_def = gimple_build (&stmts,
9871                                   PLUS_EXPR, step_vectype, vec_def, up);
9872           vec_def = gimple_convert (&stmts, vectype, vec_def);
9873           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9874           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9875                        UNKNOWN_LOCATION);
9876
9877           if (init_node)
9878             vec_init = vect_get_slp_vect_def (init_node, ivn);
9879           if (!nested_in_vect_loop
9880               && !integer_zerop (step_mul))
9881             {
9882               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9883               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9884                                  vec_step, step_mul);
9885               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9886                                       vec_def, up);
9887               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9888             }
9889
9890           /* Set the arguments of the phi node:  */
9891           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9892
9893           slp_node->push_vec_def (induction_phi);
9894         }
9895       if (!nested_in_vect_loop)
9896         {
9897           /* Fill up to the number of vectors we need for the whole group.  */
9898           nivs = least_common_multiple (group_size,
9899                                         const_nunits) / const_nunits;
9900           vec_steps.reserve (nivs-ivn);
9901           for (; ivn < nivs; ++ivn)
9902             {
9903               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9904               vec_steps.quick_push (vec_steps[0]);
9905             }
9906         }
9907
9908       /* Re-use IVs when we can.  We are generating further vector
9909          stmts by adding VF' * stride to the IVs generated above.  */
9910       if (ivn < nvects)
9911         {
9912           unsigned vfp
9913             = least_common_multiple (group_size, const_nunits) / group_size;
9914           tree lupdate_mul
9915             = build_vector_from_val (step_vectype,
9916                                      SCALAR_FLOAT_TYPE_P (stept)
9917                                      ? build_real_from_wide (stept,
9918                                                              vfp, UNSIGNED)
9919                                      : build_int_cstu (stept, vfp));
9920           for (; ivn < nvects; ++ivn)
9921             {
9922               gimple *iv
9923                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9924               tree def = gimple_get_lhs (iv);
9925               if (ivn < 2*nivs)
9926                 vec_steps[ivn - nivs]
9927                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9928                                   vec_steps[ivn - nivs], lupdate_mul);
9929               gimple_seq stmts = NULL;
9930               def = gimple_convert (&stmts, step_vectype, def);
9931               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9932                                   def, vec_steps[ivn % nivs]);
9933               def = gimple_convert (&stmts, vectype, def);
9934               if (gimple_code (iv) == GIMPLE_PHI)
9935                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9936               else
9937                 {
9938                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9939                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9940                 }
9941               slp_node->push_vec_def (def);
9942             }
9943         }
9944
9945       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9946       gcc_assert (!new_bb);
9947
9948       return true;
9949     }
9950
9951   init_expr = vect_phi_initial_value (phi);
9952
9953   gimple_seq stmts = NULL;
9954   if (!nested_in_vect_loop)
9955     {
9956       /* Convert the initial value to the IV update type.  */
9957       tree new_type = TREE_TYPE (step_expr);
9958       init_expr = gimple_convert (&stmts, new_type, init_expr);
9959
9960       /* If we are using the loop mask to "peel" for alignment then we need
9961          to adjust the start value here.  */
9962       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9963       if (skip_niters != NULL_TREE)
9964         {
9965           if (FLOAT_TYPE_P (vectype))
9966             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9967                                         skip_niters);
9968           else
9969             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9970           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9971                                          skip_niters, step_expr);
9972           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9973                                     init_expr, skip_step);
9974         }
9975     }
9976
9977   if (stmts)
9978     {
9979       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9980       gcc_assert (!new_bb);
9981     }
9982
9983   /* Create the vector that holds the initial_value of the induction.  */
9984   if (nested_in_vect_loop)
9985     {
9986       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
9987          been created during vectorization of previous stmts.  We obtain it
9988          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
9989       auto_vec<tree> vec_inits;
9990       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9991                                      init_expr, &vec_inits);
9992       vec_init = vec_inits[0];
9993       /* If the initial value is not of proper type, convert it.  */
9994       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9995         {
9996           new_stmt
9997             = gimple_build_assign (vect_get_new_ssa_name (vectype,
9998                                                           vect_simple_var,
9999                                                           "vec_iv_"),
10000                                    VIEW_CONVERT_EXPR,
10001                                    build1 (VIEW_CONVERT_EXPR, vectype,
10002                                            vec_init));
10003           vec_init = gimple_assign_lhs (new_stmt);
10004           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10005                                                  new_stmt);
10006           gcc_assert (!new_bb);
10007         }
10008     }
10009   else
10010     {
10011       /* iv_loop is the loop to be vectorized. Create:
10012          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10013       stmts = NULL;
10014       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10015
10016       unsigned HOST_WIDE_INT const_nunits;
10017       if (nunits.is_constant (&const_nunits))
10018         {
10019           tree_vector_builder elts (step_vectype, const_nunits, 1);
10020           elts.quick_push (new_name);
10021           for (i = 1; i < const_nunits; i++)
10022             {
10023               /* Create: new_name_i = new_name + step_expr  */
10024               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10025                                        new_name, step_expr);
10026               elts.quick_push (new_name);
10027             }
10028           /* Create a vector from [new_name_0, new_name_1, ...,
10029              new_name_nunits-1]  */
10030           vec_init = gimple_build_vector (&stmts, &elts);
10031         }
10032       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10033         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10034         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10035                                  new_name, step_expr);
10036       else
10037         {
10038           /* Build:
10039                 [base, base, base, ...]
10040                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10041           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10042           gcc_assert (flag_associative_math);
10043           tree index = build_index_vector (step_vectype, 0, 1);
10044           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10045                                                         new_name);
10046           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10047                                                         step_expr);
10048           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10049           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10050                                    vec_init, step_vec);
10051           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10052                                    vec_init, base_vec);
10053         }
10054       vec_init = gimple_convert (&stmts, vectype, vec_init);
10055
10056       if (stmts)
10057         {
10058           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10059           gcc_assert (!new_bb);
10060         }
10061     }
10062
10063
10064   /* Create the vector that holds the step of the induction.  */
10065   if (nested_in_vect_loop)
10066     /* iv_loop is nested in the loop to be vectorized. Generate:
10067        vec_step = [S, S, S, S]  */
10068     new_name = step_expr;
10069   else
10070     {
10071       /* iv_loop is the loop to be vectorized. Generate:
10072           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10073       gimple_seq seq = NULL;
10074       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10075         {
10076           expr = build_int_cst (integer_type_node, vf);
10077           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10078         }
10079       else
10080         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10081       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10082                                expr, step_expr);
10083       if (seq)
10084         {
10085           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10086           gcc_assert (!new_bb);
10087         }
10088     }
10089
10090   t = unshare_expr (new_name);
10091   gcc_assert (CONSTANT_CLASS_P (new_name)
10092               || TREE_CODE (new_name) == SSA_NAME);
10093   new_vec = build_vector_from_val (step_vectype, t);
10094   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10095                                new_vec, step_vectype, NULL);
10096
10097
10098   /* Create the following def-use cycle:
10099      loop prolog:
10100          vec_init = ...
10101          vec_step = ...
10102      loop:
10103          vec_iv = PHI <vec_init, vec_loop>
10104          ...
10105          STMT
10106          ...
10107          vec_loop = vec_iv + vec_step;  */
10108
10109   /* Create the induction-phi that defines the induction-operand.  */
10110   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10111   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10112   induc_def = PHI_RESULT (induction_phi);
10113
10114   /* Create the iv update inside the loop  */
10115   stmts = NULL;
10116   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10117   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10118   vec_def = gimple_convert (&stmts, vectype, vec_def);
10119   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10120   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10121
10122   /* Set the arguments of the phi node:  */
10123   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10124   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10125                UNKNOWN_LOCATION);
10126
10127   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10128   *vec_stmt = induction_phi;
10129
10130   /* In case that vectorization factor (VF) is bigger than the number
10131      of elements that we can fit in a vectype (nunits), we have to generate
10132      more than one vector stmt - i.e - we need to "unroll" the
10133      vector stmt by a factor VF/nunits.  For more details see documentation
10134      in vectorizable_operation.  */
10135
10136   if (ncopies > 1)
10137     {
10138       gimple_seq seq = NULL;
10139       /* FORNOW. This restriction should be relaxed.  */
10140       gcc_assert (!nested_in_vect_loop);
10141
10142       /* Create the vector that holds the step of the induction.  */
10143       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10144         {
10145           expr = build_int_cst (integer_type_node, nunits);
10146           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10147         }
10148       else
10149         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10150       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10151                                expr, step_expr);
10152       if (seq)
10153         {
10154           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10155           gcc_assert (!new_bb);
10156         }
10157
10158       t = unshare_expr (new_name);
10159       gcc_assert (CONSTANT_CLASS_P (new_name)
10160                   || TREE_CODE (new_name) == SSA_NAME);
10161       new_vec = build_vector_from_val (step_vectype, t);
10162       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10163                                    new_vec, step_vectype, NULL);
10164
10165       vec_def = induc_def;
10166       for (i = 1; i < ncopies + 1; i++)
10167         {
10168           /* vec_i = vec_prev + vec_step  */
10169           gimple_seq stmts = NULL;
10170           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10171           vec_def = gimple_build (&stmts,
10172                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10173           vec_def = gimple_convert (&stmts, vectype, vec_def);
10174
10175           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10176           if (i < ncopies)
10177             {
10178               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10179               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10180             }
10181           else
10182             {
10183               /* vec_1 = vec_iv + (VF/n * S)
10184                  vec_2 = vec_1 + (VF/n * S)
10185                  ...
10186                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10187
10188                  vec_n is used as vec_loop to save the large step register and
10189                  related operations.  */
10190               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10191                            UNKNOWN_LOCATION);
10192             }
10193         }
10194     }
10195
10196   if (dump_enabled_p ())
10197     dump_printf_loc (MSG_NOTE, vect_location,
10198                      "transform induction: created def-use cycle: %G%G",
10199                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10200
10201   return true;
10202 }
10203
10204 /* Function vectorizable_live_operation.
10205
10206    STMT_INFO computes a value that is used outside the loop.  Check if
10207    it can be supported.  */
10208
10209 bool
10210 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10211                              slp_tree slp_node, slp_instance slp_node_instance,
10212                              int slp_index, bool vec_stmt_p,
10213                              stmt_vector_for_cost *cost_vec)
10214 {
10215   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10216   imm_use_iterator imm_iter;
10217   tree lhs, lhs_type, bitsize;
10218   tree vectype = (slp_node
10219                   ? SLP_TREE_VECTYPE (slp_node)
10220                   : STMT_VINFO_VECTYPE (stmt_info));
10221   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10222   int ncopies;
10223   gimple *use_stmt;
10224   auto_vec<tree> vec_oprnds;
10225   int vec_entry = 0;
10226   poly_uint64 vec_index = 0;
10227
10228   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10229
10230   /* If a stmt of a reduction is live, vectorize it via
10231      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10232      validity so just trigger the transform here.  */
10233   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10234     {
10235       if (!vec_stmt_p)
10236         return true;
10237       if (slp_node)
10238         {
10239           /* For reduction chains the meta-info is attached to
10240              the group leader.  */
10241           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10242             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10243           /* For SLP reductions we vectorize the epilogue for
10244              all involved stmts together.  */
10245           else if (slp_index != 0)
10246             return true;
10247         }
10248       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10249       gcc_assert (reduc_info->is_reduc_info);
10250       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10251           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10252         return true;
10253       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10254                                         slp_node_instance);
10255       return true;
10256     }
10257
10258   /* If STMT is not relevant and it is a simple assignment and its inputs are
10259      invariant then it can remain in place, unvectorized.  The original last
10260      scalar value that it computes will be used.  */
10261   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10262     {
10263       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10264       if (dump_enabled_p ())
10265         dump_printf_loc (MSG_NOTE, vect_location,
10266                          "statement is simple and uses invariant.  Leaving in "
10267                          "place.\n");
10268       return true;
10269     }
10270
10271   if (slp_node)
10272     ncopies = 1;
10273   else
10274     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10275
10276   if (slp_node)
10277     {
10278       gcc_assert (slp_index >= 0);
10279
10280       /* Get the last occurrence of the scalar index from the concatenation of
10281          all the slp vectors. Calculate which slp vector it is and the index
10282          within.  */
10283       int num_scalar = SLP_TREE_LANES (slp_node);
10284       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10285       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10286
10287       /* Calculate which vector contains the result, and which lane of
10288          that vector we need.  */
10289       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10290         {
10291           if (dump_enabled_p ())
10292             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10293                              "Cannot determine which vector holds the"
10294                              " final result.\n");
10295           return false;
10296         }
10297     }
10298
10299   if (!vec_stmt_p)
10300     {
10301       /* No transformation required.  */
10302       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10303         {
10304           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10305                                                OPTIMIZE_FOR_SPEED))
10306             {
10307               if (dump_enabled_p ())
10308                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10309                                  "can't operate on partial vectors "
10310                                  "because the target doesn't support extract "
10311                                  "last reduction.\n");
10312               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10313             }
10314           else if (slp_node)
10315             {
10316               if (dump_enabled_p ())
10317                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10318                                  "can't operate on partial vectors "
10319                                  "because an SLP statement is live after "
10320                                  "the loop.\n");
10321               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10322             }
10323           else if (ncopies > 1)
10324             {
10325               if (dump_enabled_p ())
10326                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10327                                  "can't operate on partial vectors "
10328                                  "because ncopies is greater than 1.\n");
10329               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10330             }
10331           else
10332             {
10333               gcc_assert (ncopies == 1 && !slp_node);
10334               vect_record_loop_mask (loop_vinfo,
10335                                      &LOOP_VINFO_MASKS (loop_vinfo),
10336                                      1, vectype, NULL);
10337             }
10338         }
10339       /* ???  Enable for loop costing as well.  */
10340       if (!loop_vinfo)
10341         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10342                           0, vect_epilogue);
10343       return true;
10344     }
10345
10346   /* Use the lhs of the original scalar statement.  */
10347   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10348   if (dump_enabled_p ())
10349     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10350                      "stmt %G", stmt);
10351
10352   lhs = gimple_get_lhs (stmt);
10353   lhs_type = TREE_TYPE (lhs);
10354
10355   bitsize = vector_element_bits_tree (vectype);
10356
10357   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10358   tree vec_lhs, bitstart;
10359   gimple *vec_stmt;
10360   if (slp_node)
10361     {
10362       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10363
10364       /* Get the correct slp vectorized stmt.  */
10365       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10366       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10367
10368       /* Get entry to use.  */
10369       bitstart = bitsize_int (vec_index);
10370       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10371     }
10372   else
10373     {
10374       /* For multiple copies, get the last copy.  */
10375       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10376       vec_lhs = gimple_get_lhs (vec_stmt);
10377
10378       /* Get the last lane in the vector.  */
10379       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10380     }
10381
10382   if (loop_vinfo)
10383     {
10384       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10385          requirement, insert one phi node for it.  It looks like:
10386            loop;
10387          BB:
10388            # lhs' = PHI <lhs>
10389          ==>
10390            loop;
10391          BB:
10392            # vec_lhs' = PHI <vec_lhs>
10393            new_tree = lane_extract <vec_lhs', ...>;
10394            lhs' = new_tree;  */
10395
10396       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10397       basic_block exit_bb = single_exit (loop)->dest;
10398       gcc_assert (single_pred_p (exit_bb));
10399
10400       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10401       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10402       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10403
10404       gimple_seq stmts = NULL;
10405       tree new_tree;
10406       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10407         {
10408           /* Emit:
10409
10410                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10411
10412              where VEC_LHS is the vectorized live-out result and MASK is
10413              the loop mask for the final iteration.  */
10414           gcc_assert (ncopies == 1 && !slp_node);
10415           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10416           gimple_seq tem = NULL;
10417           gimple_stmt_iterator gsi = gsi_last (tem);
10418           tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10419                                           &LOOP_VINFO_MASKS (loop_vinfo),
10420                                           1, vectype, 0);
10421           gimple_seq_add_seq (&stmts, tem);
10422           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10423                                           mask, vec_lhs_phi);
10424
10425           /* Convert the extracted vector element to the scalar type.  */
10426           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10427         }
10428       else
10429         {
10430           tree bftype = TREE_TYPE (vectype);
10431           if (VECTOR_BOOLEAN_TYPE_P (vectype))
10432             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10433           new_tree = build3 (BIT_FIELD_REF, bftype,
10434                              vec_lhs_phi, bitsize, bitstart);
10435           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10436                                            &stmts, true, NULL_TREE);
10437         }
10438
10439       if (stmts)
10440         {
10441           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10442           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10443
10444           /* Remove existing phi from lhs and create one copy from new_tree.  */
10445           tree lhs_phi = NULL_TREE;
10446           gimple_stmt_iterator gsi;
10447           for (gsi = gsi_start_phis (exit_bb);
10448                !gsi_end_p (gsi); gsi_next (&gsi))
10449             {
10450               gimple *phi = gsi_stmt (gsi);
10451               if ((gimple_phi_arg_def (phi, 0) == lhs))
10452                 {
10453                   remove_phi_node (&gsi, false);
10454                   lhs_phi = gimple_phi_result (phi);
10455                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10456                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10457                   break;
10458                 }
10459             }
10460         }
10461
10462       /* Replace use of lhs with newly computed result.  If the use stmt is a
10463          single arg PHI, just replace all uses of PHI result.  It's necessary
10464          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10465       use_operand_p use_p;
10466       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10467         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10468             && !is_gimple_debug (use_stmt))
10469           {
10470             if (gimple_code (use_stmt) == GIMPLE_PHI
10471                 && gimple_phi_num_args (use_stmt) == 1)
10472               {
10473                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10474               }
10475             else
10476               {
10477                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10478                     SET_USE (use_p, new_tree);
10479               }
10480             update_stmt (use_stmt);
10481           }
10482     }
10483   else
10484     {
10485       /* For basic-block vectorization simply insert the lane-extraction.  */
10486       tree bftype = TREE_TYPE (vectype);
10487       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10488         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10489       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10490                               vec_lhs, bitsize, bitstart);
10491       gimple_seq stmts = NULL;
10492       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10493                                        &stmts, true, NULL_TREE);
10494       if (TREE_CODE (new_tree) == SSA_NAME
10495           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10496         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10497       if (is_a <gphi *> (vec_stmt))
10498         {
10499           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10500           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10501         }
10502       else
10503         {
10504           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10505           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10506         }
10507
10508       /* Replace use of lhs with newly computed result.  If the use stmt is a
10509          single arg PHI, just replace all uses of PHI result.  It's necessary
10510          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10511       use_operand_p use_p;
10512       stmt_vec_info use_stmt_info;
10513       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10514         if (!is_gimple_debug (use_stmt)
10515             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10516                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10517           {
10518             /* ???  This can happen when the live lane ends up being
10519                used in a vector construction code-generated by an
10520                external SLP node (and code-generation for that already
10521                happened).  See gcc.dg/vect/bb-slp-47.c.
10522                Doing this is what would happen if that vector CTOR
10523                were not code-generated yet so it is not too bad.
10524                ???  In fact we'd likely want to avoid this situation
10525                in the first place.  */
10526             if (TREE_CODE (new_tree) == SSA_NAME
10527                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10528                 && gimple_code (use_stmt) != GIMPLE_PHI
10529                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10530                                                 use_stmt))
10531               {
10532                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10533                 gcc_checking_assert (code == SSA_NAME
10534                                      || code == CONSTRUCTOR
10535                                      || code == VIEW_CONVERT_EXPR
10536                                      || CONVERT_EXPR_CODE_P (code));
10537                 if (dump_enabled_p ())
10538                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10539                                    "Using original scalar computation for "
10540                                    "live lane because use preceeds vector "
10541                                    "def\n");
10542                 continue;
10543               }
10544             /* ???  It can also happen that we end up pulling a def into
10545                a loop where replacing out-of-loop uses would require
10546                a new LC SSA PHI node.  Retain the original scalar in
10547                those cases as well.  PR98064.  */
10548             if (TREE_CODE (new_tree) == SSA_NAME
10549                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10550                 && (gimple_bb (use_stmt)->loop_father
10551                     != gimple_bb (vec_stmt)->loop_father)
10552                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10553                                         gimple_bb (use_stmt)->loop_father))
10554               {
10555                 if (dump_enabled_p ())
10556                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10557                                    "Using original scalar computation for "
10558                                    "live lane because there is an out-of-loop "
10559                                    "definition for it\n");
10560                 continue;
10561               }
10562             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10563               SET_USE (use_p, new_tree);
10564             update_stmt (use_stmt);
10565           }
10566     }
10567
10568   return true;
10569 }
10570
10571 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
10572
10573 static void
10574 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10575 {
10576   ssa_op_iter op_iter;
10577   imm_use_iterator imm_iter;
10578   def_operand_p def_p;
10579   gimple *ustmt;
10580
10581   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10582     {
10583       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10584         {
10585           basic_block bb;
10586
10587           if (!is_gimple_debug (ustmt))
10588             continue;
10589
10590           bb = gimple_bb (ustmt);
10591
10592           if (!flow_bb_inside_loop_p (loop, bb))
10593             {
10594               if (gimple_debug_bind_p (ustmt))
10595                 {
10596                   if (dump_enabled_p ())
10597                     dump_printf_loc (MSG_NOTE, vect_location,
10598                                      "killing debug use\n");
10599
10600                   gimple_debug_bind_reset_value (ustmt);
10601                   update_stmt (ustmt);
10602                 }
10603               else
10604                 gcc_unreachable ();
10605             }
10606         }
10607     }
10608 }
10609
10610 /* Given loop represented by LOOP_VINFO, return true if computation of
10611    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10612    otherwise.  */
10613
10614 static bool
10615 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10616 {
10617   /* Constant case.  */
10618   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10619     {
10620       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10621       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10622
10623       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10624       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10625       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10626         return true;
10627     }
10628
10629   widest_int max;
10630   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10631   /* Check the upper bound of loop niters.  */
10632   if (get_max_loop_iterations (loop, &max))
10633     {
10634       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10635       signop sgn = TYPE_SIGN (type);
10636       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10637       if (max < type_max)
10638         return true;
10639     }
10640   return false;
10641 }
10642
10643 /* Return a mask type with half the number of elements as OLD_TYPE,
10644    given that it should have mode NEW_MODE.  */
10645
10646 tree
10647 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10648 {
10649   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10650   return build_truth_vector_type_for_mode (nunits, new_mode);
10651 }
10652
10653 /* Return a mask type with twice as many elements as OLD_TYPE,
10654    given that it should have mode NEW_MODE.  */
10655
10656 tree
10657 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10658 {
10659   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10660   return build_truth_vector_type_for_mode (nunits, new_mode);
10661 }
10662
10663 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10664    contain a sequence of NVECTORS masks that each control a vector of type
10665    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
10666    these vector masks with the vector version of SCALAR_MASK.  */
10667
10668 void
10669 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10670                        unsigned int nvectors, tree vectype, tree scalar_mask)
10671 {
10672   gcc_assert (nvectors != 0);
10673
10674   if (scalar_mask)
10675     {
10676       scalar_cond_masked_key cond (scalar_mask, nvectors);
10677       loop_vinfo->scalar_cond_masked_set.add (cond);
10678     }
10679
10680   masks->mask_set.add (std::make_pair (vectype, nvectors));
10681 }
10682
10683 /* Given a complete set of masks MASKS, extract mask number INDEX
10684    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10685    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
10686
10687    See the comment above vec_loop_masks for more details about the mask
10688    arrangement.  */
10689
10690 tree
10691 vect_get_loop_mask (loop_vec_info loop_vinfo,
10692                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10693                     unsigned int nvectors, tree vectype, unsigned int index)
10694 {
10695   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10696       == vect_partial_vectors_while_ult)
10697     {
10698       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10699       tree mask_type = rgm->type;
10700
10701       /* Populate the rgroup's mask array, if this is the first time we've
10702          used it.  */
10703       if (rgm->controls.is_empty ())
10704         {
10705           rgm->controls.safe_grow_cleared (nvectors, true);
10706           for (unsigned int i = 0; i < nvectors; ++i)
10707             {
10708               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10709               /* Provide a dummy definition until the real one is available.  */
10710               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10711               rgm->controls[i] = mask;
10712             }
10713         }
10714
10715       tree mask = rgm->controls[index];
10716       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10717                     TYPE_VECTOR_SUBPARTS (vectype)))
10718         {
10719           /* A loop mask for data type X can be reused for data type Y
10720              if X has N times more elements than Y and if Y's elements
10721              are N times bigger than X's.  In this case each sequence
10722              of N elements in the loop mask will be all-zero or all-one.
10723              We can then view-convert the mask so that each sequence of
10724              N elements is replaced by a single element.  */
10725           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10726                                   TYPE_VECTOR_SUBPARTS (vectype)));
10727           gimple_seq seq = NULL;
10728           mask_type = truth_type_for (vectype);
10729           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10730           if (seq)
10731             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10732         }
10733       return mask;
10734     }
10735   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10736            == vect_partial_vectors_avx512)
10737     {
10738       /* The number of scalars per iteration and the number of vectors are
10739          both compile-time constants.  */
10740       unsigned int nscalars_per_iter
10741         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10742                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10743
10744       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10745
10746       /* The stored nV is dependent on the mask type produced.  */
10747       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10748                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10749                   == rgm->factor);
10750       nvectors = rgm->factor;
10751
10752       /* Populate the rgroup's mask array, if this is the first time we've
10753          used it.  */
10754       if (rgm->controls.is_empty ())
10755         {
10756           rgm->controls.safe_grow_cleared (nvectors, true);
10757           for (unsigned int i = 0; i < nvectors; ++i)
10758             {
10759               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10760               /* Provide a dummy definition until the real one is available.  */
10761               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10762               rgm->controls[i] = mask;
10763             }
10764         }
10765       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10766                     TYPE_VECTOR_SUBPARTS (vectype)))
10767         return rgm->controls[index];
10768
10769       /* Split the vector if needed.  Since we are dealing with integer mode
10770          masks with AVX512 we can operate on the integer representation
10771          performing the whole vector shifting.  */
10772       unsigned HOST_WIDE_INT factor;
10773       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10774                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
10775       gcc_assert (ok);
10776       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10777       tree mask_type = truth_type_for (vectype);
10778       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10779       unsigned vi = index / factor;
10780       unsigned vpart = index % factor;
10781       tree vec = rgm->controls[vi];
10782       gimple_seq seq = NULL;
10783       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10784                           lang_hooks.types.type_for_mode
10785                                 (TYPE_MODE (rgm->type), 1), vec);
10786       /* For integer mode masks simply shift the right bits into position.  */
10787       if (vpart != 0)
10788         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10789                             build_int_cst (integer_type_node,
10790                                            (TYPE_VECTOR_SUBPARTS (vectype)
10791                                             * vpart)));
10792       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10793                                     (TYPE_MODE (mask_type), 1), vec);
10794       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10795       if (seq)
10796         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10797       return vec;
10798     }
10799   else
10800     gcc_unreachable ();
10801 }
10802
10803 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10804    lengths for controlling an operation on VECTYPE.  The operation splits
10805    each element of VECTYPE into FACTOR separate subelements, measuring the
10806    length as a number of these subelements.  */
10807
10808 void
10809 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10810                       unsigned int nvectors, tree vectype, unsigned int factor)
10811 {
10812   gcc_assert (nvectors != 0);
10813   if (lens->length () < nvectors)
10814     lens->safe_grow_cleared (nvectors, true);
10815   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10816
10817   /* The number of scalars per iteration, scalar occupied bytes and
10818      the number of vectors are both compile-time constants.  */
10819   unsigned int nscalars_per_iter
10820     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10821                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10822
10823   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10824     {
10825       /* For now, we only support cases in which all loads and stores fall back
10826          to VnQI or none do.  */
10827       gcc_assert (!rgl->max_nscalars_per_iter
10828                   || (rgl->factor == 1 && factor == 1)
10829                   || (rgl->max_nscalars_per_iter * rgl->factor
10830                       == nscalars_per_iter * factor));
10831       rgl->max_nscalars_per_iter = nscalars_per_iter;
10832       rgl->type = vectype;
10833       rgl->factor = factor;
10834     }
10835 }
10836
10837 /* Given a complete set of lengths LENS, extract length number INDEX
10838    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10839    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
10840    multipled by the number of elements that should be processed.
10841    Insert any set-up statements before GSI.  */
10842
10843 tree
10844 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10845                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10846                    unsigned int index, unsigned int factor)
10847 {
10848   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10849   bool use_bias_adjusted_len =
10850     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10851
10852   /* Populate the rgroup's len array, if this is the first time we've
10853      used it.  */
10854   if (rgl->controls.is_empty ())
10855     {
10856       rgl->controls.safe_grow_cleared (nvectors, true);
10857       for (unsigned int i = 0; i < nvectors; ++i)
10858         {
10859           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10860           gcc_assert (len_type != NULL_TREE);
10861
10862           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10863
10864           /* Provide a dummy definition until the real one is available.  */
10865           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10866           rgl->controls[i] = len;
10867
10868           if (use_bias_adjusted_len)
10869             {
10870               gcc_assert (i == 0);
10871               tree adjusted_len =
10872                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10873               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10874               rgl->bias_adjusted_ctrl = adjusted_len;
10875             }
10876         }
10877     }
10878
10879   if (use_bias_adjusted_len)
10880     return rgl->bias_adjusted_ctrl;
10881
10882   tree loop_len = rgl->controls[index];
10883   if (rgl->factor == 1 && factor == 1)
10884     {
10885       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10886       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10887       if (maybe_ne (nunits1, nunits2))
10888         {
10889           /* A loop len for data type X can be reused for data type Y
10890              if X has N times more elements than Y and if Y's elements
10891              are N times bigger than X's.  */
10892           gcc_assert (multiple_p (nunits1, nunits2));
10893           factor = exact_div (nunits1, nunits2).to_constant ();
10894           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10895           gimple_seq seq = NULL;
10896           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10897                                    build_int_cst (iv_type, factor));
10898           if (seq)
10899             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10900         }
10901     }
10902   return loop_len;
10903 }
10904
10905 /* Scale profiling counters by estimation for LOOP which is vectorized
10906    by factor VF.
10907    If FLAT is true, the loop we started with had unrealistically flat
10908    profile.  */
10909
10910 static void
10911 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10912 {
10913   /* For flat profiles do not scale down proportionally by VF and only
10914      cap by known iteration count bounds.  */
10915   if (flat)
10916     {
10917       if (dump_file && (dump_flags & TDF_DETAILS))
10918         fprintf (dump_file,
10919                  "Vectorized loop profile seems flat; not scaling iteration "
10920                  "count down by the vectorization factor %i\n", vf);
10921       scale_loop_profile (loop, profile_probability::always (),
10922                           get_likely_max_loop_iterations_int (loop));
10923       return;
10924     }
10925   /* Loop body executes VF fewer times and exit increases VF times.  */
10926   edge exit_e = single_exit (loop);
10927   profile_count entry_count = loop_preheader_edge (loop)->count ();
10928
10929   /* If we have unreliable loop profile avoid dropping entry
10930      count bellow header count.  This can happen since loops
10931      has unrealistically low trip counts.  */
10932   while (vf > 1
10933          && loop->header->count > entry_count
10934          && loop->header->count < entry_count * vf)
10935     {
10936       if (dump_file && (dump_flags & TDF_DETAILS))
10937         fprintf (dump_file,
10938                  "Vectorization factor %i seems too large for profile "
10939                  "prevoiusly believed to be consistent; reducing.\n", vf);
10940       vf /= 2;
10941     }
10942
10943   if (entry_count.nonzero_p ())
10944     set_edge_probability_and_rescale_others
10945             (exit_e,
10946              entry_count.probability_in (loop->header->count / vf));
10947   /* Avoid producing very large exit probability when we do not have
10948      sensible profile.  */
10949   else if (exit_e->probability < profile_probability::always () / (vf * 2))
10950     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10951   loop->latch->count = single_pred_edge (loop->latch)->count ();
10952
10953   scale_loop_profile (loop, profile_probability::always () / vf,
10954                       get_likely_max_loop_iterations_int (loop));
10955 }
10956
10957 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10958    latch edge values originally defined by it.  */
10959
10960 static void
10961 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10962                                      stmt_vec_info def_stmt_info)
10963 {
10964   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10965   if (!def || TREE_CODE (def) != SSA_NAME)
10966     return;
10967   stmt_vec_info phi_info;
10968   imm_use_iterator iter;
10969   use_operand_p use_p;
10970   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10971     {
10972       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10973       if (!phi)
10974         continue;
10975       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10976             && (phi_info = loop_vinfo->lookup_stmt (phi))
10977             && STMT_VINFO_RELEVANT_P (phi_info)))
10978         continue;
10979       loop_p loop = gimple_bb (phi)->loop_father;
10980       edge e = loop_latch_edge (loop);
10981       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10982         continue;
10983
10984       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10985           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10986           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10987         {
10988           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10989           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10990           gcc_assert (phi_defs.length () == latch_defs.length ());
10991           for (unsigned i = 0; i < phi_defs.length (); ++i)
10992             add_phi_arg (as_a <gphi *> (phi_defs[i]),
10993                          gimple_get_lhs (latch_defs[i]), e,
10994                          gimple_phi_arg_location (phi, e->dest_idx));
10995         }
10996       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10997         {
10998           /* For first order recurrences we have to update both uses of
10999              the latch definition, the one in the PHI node and the one
11000              in the generated VEC_PERM_EXPR.  */
11001           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11002           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11003           gcc_assert (phi_defs.length () == latch_defs.length ());
11004           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11005           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11006           for (unsigned i = 0; i < phi_defs.length (); ++i)
11007             {
11008               gassign *perm = as_a <gassign *> (phi_defs[i]);
11009               if (i > 0)
11010                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11011               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11012               update_stmt (perm);
11013             }
11014           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11015                        gimple_phi_arg_location (phi, e->dest_idx));
11016         }
11017     }
11018 }
11019
11020 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11021    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11022    stmt_vec_info.  */
11023
11024 static bool
11025 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11026                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11027 {
11028   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11029   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11030
11031   if (dump_enabled_p ())
11032     dump_printf_loc (MSG_NOTE, vect_location,
11033                      "------>vectorizing statement: %G", stmt_info->stmt);
11034
11035   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11036     vect_loop_kill_debug_uses (loop, stmt_info);
11037
11038   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11039       && !STMT_VINFO_LIVE_P (stmt_info))
11040     return false;
11041
11042   if (STMT_VINFO_VECTYPE (stmt_info))
11043     {
11044       poly_uint64 nunits
11045         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11046       if (!STMT_SLP_TYPE (stmt_info)
11047           && maybe_ne (nunits, vf)
11048           && dump_enabled_p ())
11049         /* For SLP VF is set according to unrolling factor, and not
11050            to vector size, hence for SLP this print is not valid.  */
11051         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11052     }
11053
11054   /* Pure SLP statements have already been vectorized.  We still need
11055      to apply loop vectorization to hybrid SLP statements.  */
11056   if (PURE_SLP_STMT (stmt_info))
11057     return false;
11058
11059   if (dump_enabled_p ())
11060     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11061
11062   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11063     *seen_store = stmt_info;
11064
11065   return true;
11066 }
11067
11068 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11069    in the hash_map with its corresponding values.  */
11070
11071 static tree
11072 find_in_mapping (tree t, void *context)
11073 {
11074   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11075
11076   tree *value = mapping->get (t);
11077   return value ? *value : t;
11078 }
11079
11080 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11081    original loop that has now been vectorized.
11082
11083    The inits of the data_references need to be advanced with the number of
11084    iterations of the main loop.  This has been computed in vect_do_peeling and
11085    is stored in parameter ADVANCE.  We first restore the data_references
11086    initial offset with the values recored in ORIG_DRS_INIT.
11087
11088    Since the loop_vec_info of this EPILOGUE was constructed for the original
11089    loop, its stmt_vec_infos all point to the original statements.  These need
11090    to be updated to point to their corresponding copies as well as the SSA_NAMES
11091    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11092
11093    The data_reference's connections also need to be updated.  Their
11094    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11095    stmt_vec_infos, their statements need to point to their corresponding copy,
11096    if they are gather loads or scatter stores then their reference needs to be
11097    updated to point to its corresponding copy and finally we set
11098    'base_misaligned' to false as we have already peeled for alignment in the
11099    prologue of the main loop.  */
11100
11101 static void
11102 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11103 {
11104   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11105   auto_vec<gimple *> stmt_worklist;
11106   hash_map<tree,tree> mapping;
11107   gimple *orig_stmt, *new_stmt;
11108   gimple_stmt_iterator epilogue_gsi;
11109   gphi_iterator epilogue_phi_gsi;
11110   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11111   basic_block *epilogue_bbs = get_loop_body (epilogue);
11112   unsigned i;
11113
11114   free (LOOP_VINFO_BBS (epilogue_vinfo));
11115   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11116
11117   /* Advance data_reference's with the number of iterations of the previous
11118      loop and its prologue.  */
11119   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11120
11121
11122   /* The EPILOGUE loop is a copy of the original loop so they share the same
11123      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11124      point to the copied statements.  We also create a mapping of all LHS' in
11125      the original loop and all the LHS' in the EPILOGUE and create worklists to
11126      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11127   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11128     {
11129       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11130            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11131         {
11132           new_stmt = epilogue_phi_gsi.phi ();
11133
11134           gcc_assert (gimple_uid (new_stmt) > 0);
11135           stmt_vinfo
11136             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11137
11138           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11139           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11140
11141           mapping.put (gimple_phi_result (orig_stmt),
11142                        gimple_phi_result (new_stmt));
11143           /* PHI nodes can not have patterns or related statements.  */
11144           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11145                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11146         }
11147
11148       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11149            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11150         {
11151           new_stmt = gsi_stmt (epilogue_gsi);
11152           if (is_gimple_debug (new_stmt))
11153             continue;
11154
11155           gcc_assert (gimple_uid (new_stmt) > 0);
11156           stmt_vinfo
11157             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11158
11159           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11160           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11161
11162           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11163             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11164
11165           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11166             {
11167               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11168               for (gimple_stmt_iterator gsi = gsi_start (seq);
11169                    !gsi_end_p (gsi); gsi_next (&gsi))
11170                 stmt_worklist.safe_push (gsi_stmt (gsi));
11171             }
11172
11173           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11174           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11175             {
11176               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11177               stmt_worklist.safe_push (stmt);
11178               /* Set BB such that the assert in
11179                 'get_initial_def_for_reduction' is able to determine that
11180                 the BB of the related stmt is inside this loop.  */
11181               gimple_set_bb (stmt,
11182                              gimple_bb (new_stmt));
11183               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11184               gcc_assert (related_vinfo == NULL
11185                           || related_vinfo == stmt_vinfo);
11186             }
11187         }
11188     }
11189
11190   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11191      using the original main loop and thus need to be updated to refer to the
11192      cloned variables used in the epilogue.  */
11193   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11194     {
11195       gimple *stmt = stmt_worklist[i];
11196       tree *new_op;
11197
11198       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11199         {
11200           tree op = gimple_op (stmt, j);
11201           if ((new_op = mapping.get(op)))
11202             gimple_set_op (stmt, j, *new_op);
11203           else
11204             {
11205               /* PR92429: The last argument of simplify_replace_tree disables
11206                  folding when replacing arguments.  This is required as
11207                  otherwise you might end up with different statements than the
11208                  ones analyzed in vect_loop_analyze, leading to different
11209                  vectorization.  */
11210               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11211                                           &find_in_mapping, &mapping, false);
11212               gimple_set_op (stmt, j, op);
11213             }
11214         }
11215     }
11216
11217   struct data_reference *dr;
11218   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11219   FOR_EACH_VEC_ELT (datarefs, i, dr)
11220     {
11221       orig_stmt = DR_STMT (dr);
11222       gcc_assert (gimple_uid (orig_stmt) > 0);
11223       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11224       /* Data references for gather loads and scatter stores do not use the
11225          updated offset we set using ADVANCE.  Instead we have to make sure the
11226          reference in the data references point to the corresponding copy of
11227          the original in the epilogue.  */
11228       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11229           == VMAT_GATHER_SCATTER)
11230         {
11231           DR_REF (dr)
11232             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11233                                      &find_in_mapping, &mapping);
11234           DR_BASE_ADDRESS (dr)
11235             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11236                                      &find_in_mapping, &mapping);
11237         }
11238       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11239       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11240       /* The vector size of the epilogue is smaller than that of the main loop
11241          so the alignment is either the same or lower. This means the dr will
11242          thus by definition be aligned.  */
11243       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11244     }
11245
11246   epilogue_vinfo->shared->datarefs_copy.release ();
11247   epilogue_vinfo->shared->save_datarefs ();
11248 }
11249
11250 /* Function vect_transform_loop.
11251
11252    The analysis phase has determined that the loop is vectorizable.
11253    Vectorize the loop - created vectorized stmts to replace the scalar
11254    stmts in the loop, and update the loop exit condition.
11255    Returns scalar epilogue loop if any.  */
11256
11257 class loop *
11258 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11259 {
11260   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11261   class loop *epilogue = NULL;
11262   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11263   int nbbs = loop->num_nodes;
11264   int i;
11265   tree niters_vector = NULL_TREE;
11266   tree step_vector = NULL_TREE;
11267   tree niters_vector_mult_vf = NULL_TREE;
11268   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11269   unsigned int lowest_vf = constant_lower_bound (vf);
11270   gimple *stmt;
11271   bool check_profitability = false;
11272   unsigned int th;
11273   bool flat = maybe_flat_loop_profile (loop);
11274
11275   DUMP_VECT_SCOPE ("vec_transform_loop");
11276
11277   loop_vinfo->shared->check_datarefs ();
11278
11279   /* Use the more conservative vectorization threshold.  If the number
11280      of iterations is constant assume the cost check has been performed
11281      by our caller.  If the threshold makes all loops profitable that
11282      run at least the (estimated) vectorization factor number of times
11283      checking is pointless, too.  */
11284   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11285   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11286     {
11287       if (dump_enabled_p ())
11288         dump_printf_loc (MSG_NOTE, vect_location,
11289                          "Profitability threshold is %d loop iterations.\n",
11290                          th);
11291       check_profitability = true;
11292     }
11293
11294   /* Make sure there exists a single-predecessor exit bb.  Do this before
11295      versioning.   */
11296   edge e = single_exit (loop);
11297   if (! single_pred_p (e->dest))
11298     {
11299       split_loop_exit_edge (e, true);
11300       if (dump_enabled_p ())
11301         dump_printf (MSG_NOTE, "split exit edge\n");
11302     }
11303
11304   /* Version the loop first, if required, so the profitability check
11305      comes first.  */
11306
11307   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11308     {
11309       class loop *sloop
11310         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11311       sloop->force_vectorize = false;
11312       check_profitability = false;
11313     }
11314
11315   /* Make sure there exists a single-predecessor exit bb also on the
11316      scalar loop copy.  Do this after versioning but before peeling
11317      so CFG structure is fine for both scalar and if-converted loop
11318      to make slpeel_duplicate_current_defs_from_edges face matched
11319      loop closed PHI nodes on the exit.  */
11320   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11321     {
11322       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11323       if (! single_pred_p (e->dest))
11324         {
11325           split_loop_exit_edge (e, true);
11326           if (dump_enabled_p ())
11327             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11328         }
11329     }
11330
11331   tree niters = vect_build_loop_niters (loop_vinfo);
11332   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11333   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11334   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11335   tree advance;
11336   drs_init_vec orig_drs_init;
11337
11338   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11339                               &step_vector, &niters_vector_mult_vf, th,
11340                               check_profitability, niters_no_overflow,
11341                               &advance);
11342   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11343       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11344     {
11345       /* Ifcvt duplicates loop preheader, loop body and produces an basic
11346          block after loop exit.  We need to scale all that.  */
11347       basic_block preheader
11348         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11349       preheader->count
11350         = preheader->count.apply_probability
11351               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11352       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11353                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11354       single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11355         = preheader->count;
11356     }
11357
11358   if (niters_vector == NULL_TREE)
11359     {
11360       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11361           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11362           && known_eq (lowest_vf, vf))
11363         {
11364           niters_vector
11365             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11366                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11367           step_vector = build_one_cst (TREE_TYPE (niters));
11368         }
11369       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11370         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11371                                      &step_vector, niters_no_overflow);
11372       else
11373         /* vect_do_peeling subtracted the number of peeled prologue
11374            iterations from LOOP_VINFO_NITERS.  */
11375         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11376                                      &niters_vector, &step_vector,
11377                                      niters_no_overflow);
11378     }
11379
11380   /* 1) Make sure the loop header has exactly two entries
11381      2) Make sure we have a preheader basic block.  */
11382
11383   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11384
11385   split_edge (loop_preheader_edge (loop));
11386
11387   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11388     /* This will deal with any possible peeling.  */
11389     vect_prepare_for_masked_peels (loop_vinfo);
11390
11391   /* Schedule the SLP instances first, then handle loop vectorization
11392      below.  */
11393   if (!loop_vinfo->slp_instances.is_empty ())
11394     {
11395       DUMP_VECT_SCOPE ("scheduling SLP instances");
11396       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11397     }
11398
11399   /* FORNOW: the vectorizer supports only loops which body consist
11400      of one basic block (header + empty latch). When the vectorizer will
11401      support more involved loop forms, the order by which the BBs are
11402      traversed need to be reconsidered.  */
11403
11404   for (i = 0; i < nbbs; i++)
11405     {
11406       basic_block bb = bbs[i];
11407       stmt_vec_info stmt_info;
11408
11409       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11410            gsi_next (&si))
11411         {
11412           gphi *phi = si.phi ();
11413           if (dump_enabled_p ())
11414             dump_printf_loc (MSG_NOTE, vect_location,
11415                              "------>vectorizing phi: %G", (gimple *) phi);
11416           stmt_info = loop_vinfo->lookup_stmt (phi);
11417           if (!stmt_info)
11418             continue;
11419
11420           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11421             vect_loop_kill_debug_uses (loop, stmt_info);
11422
11423           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11424               && !STMT_VINFO_LIVE_P (stmt_info))
11425             continue;
11426
11427           if (STMT_VINFO_VECTYPE (stmt_info)
11428               && (maybe_ne
11429                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11430               && dump_enabled_p ())
11431             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11432
11433           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11434                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11435                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11436                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11437                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11438                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11439               && ! PURE_SLP_STMT (stmt_info))
11440             {
11441               if (dump_enabled_p ())
11442                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11443               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11444             }
11445         }
11446
11447       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11448            gsi_next (&si))
11449         {
11450           gphi *phi = si.phi ();
11451           stmt_info = loop_vinfo->lookup_stmt (phi);
11452           if (!stmt_info)
11453             continue;
11454
11455           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11456               && !STMT_VINFO_LIVE_P (stmt_info))
11457             continue;
11458
11459           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11460                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11461                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11462                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11463                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11464                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11465               && ! PURE_SLP_STMT (stmt_info))
11466             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11467         }
11468
11469       for (gimple_stmt_iterator si = gsi_start_bb (bb);
11470            !gsi_end_p (si);)
11471         {
11472           stmt = gsi_stmt (si);
11473           /* During vectorization remove existing clobber stmts.  */
11474           if (gimple_clobber_p (stmt))
11475             {
11476               unlink_stmt_vdef (stmt);
11477               gsi_remove (&si, true);
11478               release_defs (stmt);
11479             }
11480           else
11481             {
11482               /* Ignore vector stmts created in the outer loop.  */
11483               stmt_info = loop_vinfo->lookup_stmt (stmt);
11484
11485               /* vector stmts created in the outer-loop during vectorization of
11486                  stmts in an inner-loop may not have a stmt_info, and do not
11487                  need to be vectorized.  */
11488               stmt_vec_info seen_store = NULL;
11489               if (stmt_info)
11490                 {
11491                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11492                     {
11493                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11494                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11495                            !gsi_end_p (subsi); gsi_next (&subsi))
11496                         {
11497                           stmt_vec_info pat_stmt_info
11498                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11499                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11500                                                     &si, &seen_store);
11501                         }
11502                       stmt_vec_info pat_stmt_info
11503                         = STMT_VINFO_RELATED_STMT (stmt_info);
11504                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11505                                                     &si, &seen_store))
11506                         maybe_set_vectorized_backedge_value (loop_vinfo,
11507                                                              pat_stmt_info);
11508                     }
11509                   else
11510                     {
11511                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11512                                                     &seen_store))
11513                         maybe_set_vectorized_backedge_value (loop_vinfo,
11514                                                              stmt_info);
11515                     }
11516                 }
11517               gsi_next (&si);
11518               if (seen_store)
11519                 {
11520                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11521                     /* Interleaving.  If IS_STORE is TRUE, the
11522                        vectorization of the interleaving chain was
11523                        completed - free all the stores in the chain.  */
11524                     vect_remove_stores (loop_vinfo,
11525                                         DR_GROUP_FIRST_ELEMENT (seen_store));
11526                   else
11527                     /* Free the attached stmt_vec_info and remove the stmt.  */
11528                     loop_vinfo->remove_stmt (stmt_info);
11529                 }
11530             }
11531         }
11532
11533       /* Stub out scalar statements that must not survive vectorization.
11534          Doing this here helps with grouped statements, or statements that
11535          are involved in patterns.  */
11536       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11537            !gsi_end_p (gsi); gsi_next (&gsi))
11538         {
11539           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11540           if (!call || !gimple_call_internal_p (call))
11541             continue;
11542           internal_fn ifn = gimple_call_internal_fn (call);
11543           if (ifn == IFN_MASK_LOAD)
11544             {
11545               tree lhs = gimple_get_lhs (call);
11546               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11547                 {
11548                   tree zero = build_zero_cst (TREE_TYPE (lhs));
11549                   gimple *new_stmt = gimple_build_assign (lhs, zero);
11550                   gsi_replace (&gsi, new_stmt, true);
11551                 }
11552             }
11553           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11554             {
11555               tree lhs = gimple_get_lhs (call);
11556               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11557                 {
11558                   tree else_arg
11559                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11560                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11561                   gsi_replace (&gsi, new_stmt, true);
11562                 }
11563             }
11564         }
11565     }                           /* BBs in loop */
11566
11567   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11568      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11569   if (integer_onep (step_vector))
11570     niters_no_overflow = true;
11571   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11572                            niters_vector_mult_vf, !niters_no_overflow);
11573
11574   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11575
11576   /* True if the final iteration might not handle a full vector's
11577      worth of scalar iterations.  */
11578   bool final_iter_may_be_partial
11579     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11580   /* The minimum number of iterations performed by the epilogue.  This
11581      is 1 when peeling for gaps because we always need a final scalar
11582      iteration.  */
11583   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11584   /* +1 to convert latch counts to loop iteration counts,
11585      -min_epilogue_iters to remove iterations that cannot be performed
11586        by the vector code.  */
11587   int bias_for_lowest = 1 - min_epilogue_iters;
11588   int bias_for_assumed = bias_for_lowest;
11589   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11590   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11591     {
11592       /* When the amount of peeling is known at compile time, the first
11593          iteration will have exactly alignment_npeels active elements.
11594          In the worst case it will have at least one.  */
11595       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11596       bias_for_lowest += lowest_vf - min_first_active;
11597       bias_for_assumed += assumed_vf - min_first_active;
11598     }
11599   /* In these calculations the "- 1" converts loop iteration counts
11600      back to latch counts.  */
11601   if (loop->any_upper_bound)
11602     {
11603       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11604       loop->nb_iterations_upper_bound
11605         = (final_iter_may_be_partial
11606            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11607                             lowest_vf) - 1
11608            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11609                              lowest_vf) - 1);
11610       if (main_vinfo
11611           /* Both peeling for alignment and peeling for gaps can end up
11612              with the scalar epilogue running for more than VF-1 iterations.  */
11613           && !main_vinfo->peeling_for_alignment
11614           && !main_vinfo->peeling_for_gaps)
11615         {
11616           unsigned int bound;
11617           poly_uint64 main_iters
11618             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11619                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11620           main_iters
11621             = upper_bound (main_iters,
11622                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11623           if (can_div_away_from_zero_p (main_iters,
11624                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11625                                         &bound))
11626             loop->nb_iterations_upper_bound
11627               = wi::umin ((widest_int) (bound - 1),
11628                           loop->nb_iterations_upper_bound);
11629       }
11630   }
11631   if (loop->any_likely_upper_bound)
11632     loop->nb_iterations_likely_upper_bound
11633       = (final_iter_may_be_partial
11634          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11635                           + bias_for_lowest, lowest_vf) - 1
11636          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11637                            + bias_for_lowest, lowest_vf) - 1);
11638   if (loop->any_estimate)
11639     loop->nb_iterations_estimate
11640       = (final_iter_may_be_partial
11641          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11642                           assumed_vf) - 1
11643          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11644                            assumed_vf) - 1);
11645   scale_profile_for_vect_loop (loop, assumed_vf, flat);
11646
11647   if (dump_enabled_p ())
11648     {
11649       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11650         {
11651           dump_printf_loc (MSG_NOTE, vect_location,
11652                            "LOOP VECTORIZED\n");
11653           if (loop->inner)
11654             dump_printf_loc (MSG_NOTE, vect_location,
11655                              "OUTER LOOP VECTORIZED\n");
11656           dump_printf (MSG_NOTE, "\n");
11657         }
11658       else
11659         dump_printf_loc (MSG_NOTE, vect_location,
11660                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11661                          GET_MODE_NAME (loop_vinfo->vector_mode));
11662     }
11663
11664   /* Loops vectorized with a variable factor won't benefit from
11665      unrolling/peeling.  */
11666   if (!vf.is_constant ())
11667     {
11668       loop->unroll = 1;
11669       if (dump_enabled_p ())
11670         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11671                          " variable-length vectorization factor\n");
11672     }
11673   /* Free SLP instances here because otherwise stmt reference counting
11674      won't work.  */
11675   slp_instance instance;
11676   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11677     vect_free_slp_instance (instance);
11678   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11679   /* Clear-up safelen field since its value is invalid after vectorization
11680      since vectorized loop can have loop-carried dependencies.  */
11681   loop->safelen = 0;
11682
11683   if (epilogue)
11684     {
11685       update_epilogue_loop_vinfo (epilogue, advance);
11686
11687       epilogue->simduid = loop->simduid;
11688       epilogue->force_vectorize = loop->force_vectorize;
11689       epilogue->dont_vectorize = false;
11690     }
11691
11692   return epilogue;
11693 }
11694
11695 /* The code below is trying to perform simple optimization - revert
11696    if-conversion for masked stores, i.e. if the mask of a store is zero
11697    do not perform it and all stored value producers also if possible.
11698    For example,
11699      for (i=0; i<n; i++)
11700        if (c[i])
11701         {
11702           p1[i] += 1;
11703           p2[i] = p3[i] +2;
11704         }
11705    this transformation will produce the following semi-hammock:
11706
11707    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11708      {
11709        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11710        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11711        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11712        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11713        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11714        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11715      }
11716 */
11717
11718 void
11719 optimize_mask_stores (class loop *loop)
11720 {
11721   basic_block *bbs = get_loop_body (loop);
11722   unsigned nbbs = loop->num_nodes;
11723   unsigned i;
11724   basic_block bb;
11725   class loop *bb_loop;
11726   gimple_stmt_iterator gsi;
11727   gimple *stmt;
11728   auto_vec<gimple *> worklist;
11729   auto_purge_vect_location sentinel;
11730
11731   vect_location = find_loop_location (loop);
11732   /* Pick up all masked stores in loop if any.  */
11733   for (i = 0; i < nbbs; i++)
11734     {
11735       bb = bbs[i];
11736       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11737            gsi_next (&gsi))
11738         {
11739           stmt = gsi_stmt (gsi);
11740           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11741             worklist.safe_push (stmt);
11742         }
11743     }
11744
11745   free (bbs);
11746   if (worklist.is_empty ())
11747     return;
11748
11749   /* Loop has masked stores.  */
11750   while (!worklist.is_empty ())
11751     {
11752       gimple *last, *last_store;
11753       edge e, efalse;
11754       tree mask;
11755       basic_block store_bb, join_bb;
11756       gimple_stmt_iterator gsi_to;
11757       tree vdef, new_vdef;
11758       gphi *phi;
11759       tree vectype;
11760       tree zero;
11761
11762       last = worklist.pop ();
11763       mask = gimple_call_arg (last, 2);
11764       bb = gimple_bb (last);
11765       /* Create then_bb and if-then structure in CFG, then_bb belongs to
11766          the same loop as if_bb.  It could be different to LOOP when two
11767          level loop-nest is vectorized and mask_store belongs to the inner
11768          one.  */
11769       e = split_block (bb, last);
11770       bb_loop = bb->loop_father;
11771       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11772       join_bb = e->dest;
11773       store_bb = create_empty_bb (bb);
11774       add_bb_to_loop (store_bb, bb_loop);
11775       e->flags = EDGE_TRUE_VALUE;
11776       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11777       /* Put STORE_BB to likely part.  */
11778       efalse->probability = profile_probability::likely ();
11779       e->probability = efalse->probability.invert ();
11780       store_bb->count = efalse->count ();
11781       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11782       if (dom_info_available_p (CDI_DOMINATORS))
11783         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11784       if (dump_enabled_p ())
11785         dump_printf_loc (MSG_NOTE, vect_location,
11786                          "Create new block %d to sink mask stores.",
11787                          store_bb->index);
11788       /* Create vector comparison with boolean result.  */
11789       vectype = TREE_TYPE (mask);
11790       zero = build_zero_cst (vectype);
11791       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11792       gsi = gsi_last_bb (bb);
11793       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11794       /* Create new PHI node for vdef of the last masked store:
11795          .MEM_2 = VDEF <.MEM_1>
11796          will be converted to
11797          .MEM.3 = VDEF <.MEM_1>
11798          and new PHI node will be created in join bb
11799          .MEM_2 = PHI <.MEM_1, .MEM_3>
11800       */
11801       vdef = gimple_vdef (last);
11802       new_vdef = make_ssa_name (gimple_vop (cfun), last);
11803       gimple_set_vdef (last, new_vdef);
11804       phi = create_phi_node (vdef, join_bb);
11805       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11806
11807       /* Put all masked stores with the same mask to STORE_BB if possible.  */
11808       while (true)
11809         {
11810           gimple_stmt_iterator gsi_from;
11811           gimple *stmt1 = NULL;
11812
11813           /* Move masked store to STORE_BB.  */
11814           last_store = last;
11815           gsi = gsi_for_stmt (last);
11816           gsi_from = gsi;
11817           /* Shift GSI to the previous stmt for further traversal.  */
11818           gsi_prev (&gsi);
11819           gsi_to = gsi_start_bb (store_bb);
11820           gsi_move_before (&gsi_from, &gsi_to);
11821           /* Setup GSI_TO to the non-empty block start.  */
11822           gsi_to = gsi_start_bb (store_bb);
11823           if (dump_enabled_p ())
11824             dump_printf_loc (MSG_NOTE, vect_location,
11825                              "Move stmt to created bb\n%G", last);
11826           /* Move all stored value producers if possible.  */
11827           while (!gsi_end_p (gsi))
11828             {
11829               tree lhs;
11830               imm_use_iterator imm_iter;
11831               use_operand_p use_p;
11832               bool res;
11833
11834               /* Skip debug statements.  */
11835               if (is_gimple_debug (gsi_stmt (gsi)))
11836                 {
11837                   gsi_prev (&gsi);
11838                   continue;
11839                 }
11840               stmt1 = gsi_stmt (gsi);
11841               /* Do not consider statements writing to memory or having
11842                  volatile operand.  */
11843               if (gimple_vdef (stmt1)
11844                   || gimple_has_volatile_ops (stmt1))
11845                 break;
11846               gsi_from = gsi;
11847               gsi_prev (&gsi);
11848               lhs = gimple_get_lhs (stmt1);
11849               if (!lhs)
11850                 break;
11851
11852               /* LHS of vectorized stmt must be SSA_NAME.  */
11853               if (TREE_CODE (lhs) != SSA_NAME)
11854                 break;
11855
11856               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11857                 {
11858                   /* Remove dead scalar statement.  */
11859                   if (has_zero_uses (lhs))
11860                     {
11861                       gsi_remove (&gsi_from, true);
11862                       continue;
11863                     }
11864                 }
11865
11866               /* Check that LHS does not have uses outside of STORE_BB.  */
11867               res = true;
11868               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11869                 {
11870                   gimple *use_stmt;
11871                   use_stmt = USE_STMT (use_p);
11872                   if (is_gimple_debug (use_stmt))
11873                     continue;
11874                   if (gimple_bb (use_stmt) != store_bb)
11875                     {
11876                       res = false;
11877                       break;
11878                     }
11879                 }
11880               if (!res)
11881                 break;
11882
11883               if (gimple_vuse (stmt1)
11884                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
11885                 break;
11886
11887               /* Can move STMT1 to STORE_BB.  */
11888               if (dump_enabled_p ())
11889                 dump_printf_loc (MSG_NOTE, vect_location,
11890                                  "Move stmt to created bb\n%G", stmt1);
11891               gsi_move_before (&gsi_from, &gsi_to);
11892               /* Shift GSI_TO for further insertion.  */
11893               gsi_prev (&gsi_to);
11894             }
11895           /* Put other masked stores with the same mask to STORE_BB.  */
11896           if (worklist.is_empty ()
11897               || gimple_call_arg (worklist.last (), 2) != mask
11898               || worklist.last () != stmt1)
11899             break;
11900           last = worklist.pop ();
11901         }
11902       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11903     }
11904 }
11905
11906 /* Decide whether it is possible to use a zero-based induction variable
11907    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
11908    the value that the induction variable must be able to hold in order
11909    to ensure that the rgroups eventually have no active vector elements.
11910    Return -1 otherwise.  */
11911
11912 widest_int
11913 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11914 {
11915   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11916   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11917   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11918
11919   /* Calculate the value that the induction variable must be able
11920      to hit in order to ensure that we end the loop with an all-false mask.
11921      This involves adding the maximum number of inactive trailing scalar
11922      iterations.  */
11923   widest_int iv_limit = -1;
11924   if (max_loop_iterations (loop, &iv_limit))
11925     {
11926       if (niters_skip)
11927         {
11928           /* Add the maximum number of skipped iterations to the
11929              maximum iteration count.  */
11930           if (TREE_CODE (niters_skip) == INTEGER_CST)
11931             iv_limit += wi::to_widest (niters_skip);
11932           else
11933             iv_limit += max_vf - 1;
11934         }
11935       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11936         /* Make a conservatively-correct assumption.  */
11937         iv_limit += max_vf - 1;
11938
11939       /* IV_LIMIT is the maximum number of latch iterations, which is also
11940          the maximum in-range IV value.  Round this value down to the previous
11941          vector alignment boundary and then add an extra full iteration.  */
11942       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11943       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11944     }
11945   return iv_limit;
11946 }
11947
11948 /* For the given rgroup_controls RGC, check whether an induction variable
11949    would ever hit a value that produces a set of all-false masks or zero
11950    lengths before wrapping around.  Return true if it's possible to wrap
11951    around before hitting the desirable value, otherwise return false.  */
11952
11953 bool
11954 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11955 {
11956   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11957
11958   if (iv_limit == -1)
11959     return true;
11960
11961   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11962   unsigned int compare_precision = TYPE_PRECISION (compare_type);
11963   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11964
11965   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11966     return true;
11967
11968   return false;
11969 }