gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "backend.h"
  27 #include "target.h"
  28 #include "rtl.h"
  29 #include "tree.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "tree-pass.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "diagnostic-core.h"
  36 #include "fold-const.h"
  37 #include "stor-layout.h"
  38 #include "cfganal.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "tree-ssa-loop-ivopts.h"
  43 #include "tree-ssa-loop-manip.h"
  44 #include "tree-ssa-loop-niter.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57 #include "case-cfn-macros.h"
  58 #include "langhooks.h"
  59
  60 /* Loop Vectorization Pass.
  61
  62    This pass tries to vectorize loops.
  63
  64    For example, the vectorizer transforms the following simple loop:
  65
  66         short a[N]; short b[N]; short c[N]; int i;
  67
  68         for (i=0; i<N; i++){
  69           a[i] = b[i] + c[i];
  70         }
  71
  72    as if it was manually vectorized by rewriting the source code into:
  73
  74         typedef int __attribute__((mode(V8HI))) v8hi;
  75         short a[N];  short b[N]; short c[N];   int i;
  76         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  77         v8hi va, vb, vc;
  78
  79         for (i=0; i<N/8; i++){
  80           vb = pb[i];
  81           vc = pc[i];
  82           va = vb + vc;
  83           pa[i] = va;
  84         }
  85
  86         The main entry to this pass is vectorize_loops(), in which
  87    the vectorizer applies a set of analyses on a given set of loops,
  88    followed by the actual vectorization transformation for the loops that
  89    had successfully passed the analysis phase.
  90         Throughout this pass we make a distinction between two types of
  91    data: scalars (which are represented by SSA_NAMES), and memory references
  92    ("data-refs").  These two types of data require different handling both
  93    during analysis and transformation. The types of data-refs that the
  94    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  95    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  96    accesses are required to have a simple (consecutive) access pattern.
  97
  98    Analysis phase:
  99    ===============
 100         The driver for the analysis phase is vect_analyze_loop().
 101    It applies a set of analyses, some of which rely on the scalar evolution
 102    analyzer (scev) developed by Sebastian Pop.
 103
 104         During the analysis phase the vectorizer records some information
 105    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 106    loop, as well as general information about the loop as a whole, which is
 107    recorded in a "loop_vec_info" struct attached to each loop.
 108
 109    Transformation phase:
 110    =====================
 111         The loop transformation phase scans all the stmts in the loop, and
 112    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 113    the loop that needs to be vectorized.  It inserts the vector code sequence
 114    just before the scalar stmt S, and records a pointer to the vector code
 115    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 116    attached to S).  This pointer will be used for the vectorization of following
 117    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 118    otherwise, we rely on dead code elimination for removing it.
 119
 120         For example, say stmt S1 was vectorized into stmt VS1:
 121
 122    VS1: vb = px[i];
 123    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 124    S2:  a = b;
 125
 126    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 127    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 128    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 129    resulting sequence would be:
 130
 131    VS1: vb = px[i];
 132    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 133    VS2: va = vb;
 134    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 135
 136         Operands that are not SSA_NAMEs, are data-refs that appear in
 137    load/store operations (like 'x[i]' in S1), and are handled differently.
 138
 139    Target modeling:
 140    =================
 141         Currently the only target specific information that is used is the
 142    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 143    Targets that can support different sizes of vectors, for now will need
 144    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 145    flexibility will be added in the future.
 146
 147         Since we only vectorize operations which vector form can be
 148    expressed using existing tree codes, to verify that an operation is
 149    supported, the vectorizer checks the relevant optab at the relevant
 150    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 151    the value found is CODE_FOR_nothing, then there's no target support, and
 152    we can't vectorize the stmt.
 153
 154    For additional information on this project see:
 155    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 156 */
 157
 158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 159                                                 unsigned *);
 160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 161                                                bool *, bool *, bool);
 162
 163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 164    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 165    may already be set for general statements (not just data refs).  */
 166
 167 static opt_result
 168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 169                               bool vectype_maybe_set_p,
 170                               poly_uint64 *vf)
 171 {
 172   gimple *stmt = stmt_info->stmt;
 173
 174   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 175        && !STMT_VINFO_LIVE_P (stmt_info))
 176       || gimple_clobber_p (stmt))
 177     {
 178       if (dump_enabled_p ())
 179         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 180       return opt_result::success ();
 181     }
 182
 183   tree stmt_vectype, nunits_vectype;
 184   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 185                                                    &stmt_vectype,
 186                                                    &nunits_vectype);
 187   if (!res)
 188     return res;
 189
 190   if (stmt_vectype)
 191     {
 192       if (STMT_VINFO_VECTYPE (stmt_info))
 193         /* The only case when a vectype had been already set is for stmts
 194            that contain a data ref, or for "pattern-stmts" (stmts generated
 195            by the vectorizer to represent/replace a certain idiom).  */
 196         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 197                      || vectype_maybe_set_p)
 198                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 199       else
 200         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 201     }
 202
 203   if (nunits_vectype)
 204     vect_update_max_nunits (vf, nunits_vectype);
 205
 206   return opt_result::success ();
 207 }
 208
 209 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 210    types of STMT_INFO and all attached pattern statements and update
 211    the vectorization factor VF accordingly.  Return true on success
 212    or false if something prevented vectorization.  */
 213
 214 static opt_result
 215 vect_determine_vf_for_stmt (vec_info *vinfo,
 216                             stmt_vec_info stmt_info, poly_uint64 *vf)
 217 {
 218   if (dump_enabled_p ())
 219     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 220                      stmt_info->stmt);
 221   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 222   if (!res)
 223     return res;
 224
 225   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 226       && STMT_VINFO_RELATED_STMT (stmt_info))
 227     {
 228       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 229       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 230
 231       /* If a pattern statement has def stmts, analyze them too.  */
 232       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 233            !gsi_end_p (si); gsi_next (&si))
 234         {
 235           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 236           if (dump_enabled_p ())
 237             dump_printf_loc (MSG_NOTE, vect_location,
 238                              "==> examining pattern def stmt: %G",
 239                              def_stmt_info->stmt);
 240           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 241           if (!res)
 242             return res;
 243         }
 244
 245       if (dump_enabled_p ())
 246         dump_printf_loc (MSG_NOTE, vect_location,
 247                          "==> examining pattern statement: %G",
 248                          stmt_info->stmt);
 249       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 250       if (!res)
 251         return res;
 252     }
 253
 254   return opt_result::success ();
 255 }
 256
 257 /* Function vect_determine_vectorization_factor
 258
 259    Determine the vectorization factor (VF).  VF is the number of data elements
 260    that are operated upon in parallel in a single iteration of the vectorized
 261    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 262    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 263    elements can fit in a single vector register.
 264
 265    We currently support vectorization of loops in which all types operated upon
 266    are of the same size.  Therefore this function currently sets VF according to
 267    the size of the types operated upon, and fails if there are multiple sizes
 268    in the loop.
 269
 270    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 271    original loop:
 272         for (i=0; i<N; i++){
 273           a[i] = b[i] + c[i];
 274         }
 275
 276    vectorized loop:
 277         for (i=0; i<N; i+=VF){
 278           a[i:VF] = b[i:VF] + c[i:VF];
 279         }
 280 */
 281
 282 static opt_result
 283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 284 {
 285   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 286   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 287   unsigned nbbs = loop->num_nodes;
 288   poly_uint64 vectorization_factor = 1;
 289   tree scalar_type = NULL_TREE;
 290   gphi *phi;
 291   tree vectype;
 292   stmt_vec_info stmt_info;
 293   unsigned i;
 294
 295   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 296
 297   for (i = 0; i < nbbs; i++)
 298     {
 299       basic_block bb = bbs[i];
 300
 301       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 302            gsi_next (&si))
 303         {
 304           phi = si.phi ();
 305           stmt_info = loop_vinfo->lookup_stmt (phi);
 306           if (dump_enabled_p ())
 307             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 308                              (gimple *) phi);
 309
 310           gcc_assert (stmt_info);
 311
 312           if (STMT_VINFO_RELEVANT_P (stmt_info)
 313               || STMT_VINFO_LIVE_P (stmt_info))
 314             {
 315               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 316               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 317
 318               if (dump_enabled_p ())
 319                 dump_printf_loc (MSG_NOTE, vect_location,
 320                                  "get vectype for scalar type:  %T\n",
 321                                  scalar_type);
 322
 323               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 324               if (!vectype)
 325                 return opt_result::failure_at (phi,
 326                                                "not vectorized: unsupported "
 327                                                "data-type %T\n",
 328                                                scalar_type);
 329               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 330
 331               if (dump_enabled_p ())
 332                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 333                                  vectype);
 334
 335               if (dump_enabled_p ())
 336                 {
 337                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 338                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 339                   dump_printf (MSG_NOTE, "\n");
 340                 }
 341
 342               vect_update_max_nunits (&vectorization_factor, vectype);
 343             }
 344         }
 345
 346       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 347            gsi_next (&si))
 348         {
 349           if (is_gimple_debug (gsi_stmt (si)))
 350             continue;
 351           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 352           opt_result res
 353             = vect_determine_vf_for_stmt (loop_vinfo,
 354                                           stmt_info, &vectorization_factor);
 355           if (!res)
 356             return res;
 357         }
 358     }
 359
 360   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 361   if (dump_enabled_p ())
 362     {
 363       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 364       dump_dec (MSG_NOTE, vectorization_factor);
 365       dump_printf (MSG_NOTE, "\n");
 366     }
 367
 368   if (known_le (vectorization_factor, 1U))
 369     return opt_result::failure_at (vect_location,
 370                                    "not vectorized: unsupported data-type\n");
 371   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 372   return opt_result::success ();
 373 }
 374
 375
 376 /* Function vect_is_simple_iv_evolution.
 377
 378    FORNOW: A simple evolution of an induction variables in the loop is
 379    considered a polynomial evolution.  */
 380
 381 static bool
 382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 383                              tree * step)
 384 {
 385   tree init_expr;
 386   tree step_expr;
 387   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 388   basic_block bb;
 389
 390   /* When there is no evolution in this loop, the evolution function
 391      is not "simple".  */
 392   if (evolution_part == NULL_TREE)
 393     return false;
 394
 395   /* When the evolution is a polynomial of degree >= 2
 396      the evolution function is not "simple".  */
 397   if (tree_is_chrec (evolution_part))
 398     return false;
 399
 400   step_expr = evolution_part;
 401   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 402
 403   if (dump_enabled_p ())
 404     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 405                      step_expr, init_expr);
 406
 407   *init = init_expr;
 408   *step = step_expr;
 409
 410   if (TREE_CODE (step_expr) != INTEGER_CST
 411       && (TREE_CODE (step_expr) != SSA_NAME
 412           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 413               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 414           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 415               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 416                   || !flag_associative_math)))
 417       && (TREE_CODE (step_expr) != REAL_CST
 418           || !flag_associative_math))
 419     {
 420       if (dump_enabled_p ())
 421         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 422                          "step unknown.\n");
 423       return false;
 424     }
 425
 426   return true;
 427 }
 428
 429 /* Function vect_is_nonlinear_iv_evolution
 430
 431    Only support nonlinear induction for integer type
 432    1. neg
 433    2. mul by constant
 434    3. lshift/rshift by constant.
 435
 436    For neg induction, return a fake step as integer -1.  */
 437 static bool
 438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 439                                 gphi* loop_phi_node, tree *init, tree *step)
 440 {
 441   tree init_expr, ev_expr, result, op1, op2;
 442   gimple* def;
 443
 444   if (gimple_phi_num_args (loop_phi_node) != 2)
 445     return false;
 446
 447   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 448   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 449
 450   /* Support nonlinear induction only for integer type.  */
 451   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 452     return false;
 453
 454   *init = init_expr;
 455   result = PHI_RESULT (loop_phi_node);
 456
 457   if (TREE_CODE (ev_expr) != SSA_NAME
 458       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 459       || !is_gimple_assign (def))
 460     return false;
 461
 462   enum tree_code t_code = gimple_assign_rhs_code (def);
 463   switch (t_code)
 464     {
 465     case NEGATE_EXPR:
 466       if (gimple_assign_rhs1 (def) != result)
 467         return false;
 468       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 469       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 470       break;
 471
 472     case RSHIFT_EXPR:
 473     case LSHIFT_EXPR:
 474     case MULT_EXPR:
 475       op1 = gimple_assign_rhs1 (def);
 476       op2 = gimple_assign_rhs2 (def);
 477       if (TREE_CODE (op2) != INTEGER_CST
 478           || op1 != result)
 479         return false;
 480       *step = op2;
 481       if (t_code == LSHIFT_EXPR)
 482         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 483       else if (t_code == RSHIFT_EXPR)
 484         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 485       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 486       else
 487         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 488       break;
 489
 490     default:
 491       return false;
 492     }
 493
 494   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 495   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 496
 497   return true;
 498 }
 499
 500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 501    what we are assuming is a double reduction.  For example, given
 502    a structure like this:
 503
 504       outer1:
 505         x_1 = PHI <x_4(outer2), ...>;
 506         ...
 507
 508       inner:
 509         x_2 = PHI <x_1(outer1), ...>;
 510         ...
 511         x_3 = ...;
 512         ...
 513
 514       outer2:
 515         x_4 = PHI <x_3(inner)>;
 516         ...
 517
 518    outer loop analysis would treat x_1 as a double reduction phi and
 519    this function would then return true for x_2.  */
 520
 521 static bool
 522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 523 {
 524   use_operand_p use_p;
 525   ssa_op_iter op_iter;
 526   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 527     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 528       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 529         return true;
 530   return false;
 531 }
 532
 533 /* Returns true if Phi is a first-order recurrence. A first-order
 534    recurrence is a non-reduction recurrence relation in which the value of
 535    the recurrence in the current loop iteration equals a value defined in
 536    the previous iteration.  */
 537
 538 static bool
 539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 540                                    gphi *phi)
 541 {
 542   /* A nested cycle isn't vectorizable as first order recurrence.  */
 543   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 544     return false;
 545
 546   /* Ensure the loop latch definition is from within the loop.  */
 547   edge latch = loop_latch_edge (loop);
 548   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 549   if (TREE_CODE (ldef) != SSA_NAME
 550       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 551       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 552       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 553     return false;
 554
 555   tree def = gimple_phi_result (phi);
 556
 557   /* Ensure every use_stmt of the phi node is dominated by the latch
 558      definition.  */
 559   imm_use_iterator imm_iter;
 560   use_operand_p use_p;
 561   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 562     if (!is_gimple_debug (USE_STMT (use_p))
 563         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 564             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 565                                             USE_STMT (use_p))))
 566       return false;
 567
 568   /* First-order recurrence autovectorization needs shuffle vector.  */
 569   tree scalar_type = TREE_TYPE (def);
 570   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 571   if (!vectype)
 572     return false;
 573
 574   return true;
 575 }
 576
 577 /* Function vect_analyze_scalar_cycles_1.
 578
 579    Examine the cross iteration def-use cycles of scalar variables
 580    in LOOP.  LOOP_VINFO represents the loop that is now being
 581    considered for vectorization (can be LOOP, or an outer-loop
 582    enclosing LOOP).  SLP indicates there will be some subsequent
 583    slp analyses or not.  */
 584
 585 static void
 586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 587                               bool slp)
 588 {
 589   basic_block bb = loop->header;
 590   tree init, step;
 591   auto_vec<stmt_vec_info, 64> worklist;
 592   gphi_iterator gsi;
 593   bool double_reduc, reduc_chain;
 594
 595   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 596
 597   /* First - identify all inductions.  Reduction detection assumes that all the
 598      inductions have been identified, therefore, this order must not be
 599      changed.  */
 600   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 601     {
 602       gphi *phi = gsi.phi ();
 603       tree access_fn = NULL;
 604       tree def = PHI_RESULT (phi);
 605       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 606
 607       if (dump_enabled_p ())
 608         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 609                          (gimple *) phi);
 610
 611       /* Skip virtual phi's.  The data dependences that are associated with
 612          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 613       if (virtual_operand_p (def))
 614         continue;
 615
 616       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 617
 618       /* Analyze the evolution function.  */
 619       access_fn = analyze_scalar_evolution (loop, def);
 620       if (access_fn)
 621         {
 622           STRIP_NOPS (access_fn);
 623           if (dump_enabled_p ())
 624             dump_printf_loc (MSG_NOTE, vect_location,
 625                              "Access function of PHI: %T\n", access_fn);
 626           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 627             = initial_condition_in_loop_num (access_fn, loop->num);
 628           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 629             = evolution_part_in_loop_num (access_fn, loop->num);
 630         }
 631
 632       if ((!access_fn
 633            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 634            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 635                                             &init, &step)
 636            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 637                && TREE_CODE (step) != INTEGER_CST))
 638           /* Only handle nonlinear iv for same loop.  */
 639           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 640               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 641                                                   phi, &init, &step)))
 642         {
 643           worklist.safe_push (stmt_vinfo);
 644           continue;
 645         }
 646
 647       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 648                   != NULL_TREE);
 649       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 650
 651       if (dump_enabled_p ())
 652         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 653       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 654     }
 655
 656
 657   /* Second - identify all reductions and nested cycles.  */
 658   while (worklist.length () > 0)
 659     {
 660       stmt_vec_info stmt_vinfo = worklist.pop ();
 661       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 662       tree def = PHI_RESULT (phi);
 663
 664       if (dump_enabled_p ())
 665         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 666                          (gimple *) phi);
 667
 668       gcc_assert (!virtual_operand_p (def)
 669                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 670
 671       stmt_vec_info reduc_stmt_info
 672         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 673                                     &reduc_chain, slp);
 674       if (reduc_stmt_info)
 675         {
 676           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 677           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 678           if (double_reduc)
 679             {
 680               if (dump_enabled_p ())
 681                 dump_printf_loc (MSG_NOTE, vect_location,
 682                                  "Detected double reduction.\n");
 683
 684               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 685               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 686             }
 687           else
 688             {
 689               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 690                 {
 691                   if (dump_enabled_p ())
 692                     dump_printf_loc (MSG_NOTE, vect_location,
 693                                      "Detected vectorizable nested cycle.\n");
 694
 695                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 696                 }
 697               else
 698                 {
 699                   if (dump_enabled_p ())
 700                     dump_printf_loc (MSG_NOTE, vect_location,
 701                                      "Detected reduction.\n");
 702
 703                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 704                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 705                   /* Store the reduction cycles for possible vectorization in
 706                      loop-aware SLP if it was not detected as reduction
 707                      chain.  */
 708                   if (! reduc_chain)
 709                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 710                       (reduc_stmt_info);
 711                 }
 712             }
 713         }
 714       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 715         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 716       else
 717         if (dump_enabled_p ())
 718           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 719                            "Unknown def-use cycle pattern.\n");
 720     }
 721 }
 722
 723
 724 /* Function vect_analyze_scalar_cycles.
 725
 726    Examine the cross iteration def-use cycles of scalar variables, by
 727    analyzing the loop-header PHIs of scalar variables.  Classify each
 728    cycle as one of the following: invariant, induction, reduction, unknown.
 729    We do that for the loop represented by LOOP_VINFO, and also to its
 730    inner-loop, if exists.
 731    Examples for scalar cycles:
 732
 733    Example1: reduction:
 734
 735               loop1:
 736               for (i=0; i<N; i++)
 737                  sum += a[i];
 738
 739    Example2: induction:
 740
 741               loop2:
 742               for (i=0; i<N; i++)
 743                  a[i] = i;  */
 744
 745 static void
 746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 747 {
 748   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 749
 750   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 751
 752   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 753      Reductions in such inner-loop therefore have different properties than
 754      the reductions in the nest that gets vectorized:
 755      1. When vectorized, they are executed in the same order as in the original
 756         scalar loop, so we can't change the order of computation when
 757         vectorizing them.
 758      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 759         current checks are too strict.  */
 760
 761   if (loop->inner)
 762     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 763 }
 764
 765 /* Transfer group and reduction information from STMT_INFO to its
 766    pattern stmt.  */
 767
 768 static void
 769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 770 {
 771   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 772   stmt_vec_info stmtp;
 773   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 774               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 775   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 776   do
 777     {
 778       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 779       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 780                            == STMT_VINFO_DEF_TYPE (stmt_info));
 781       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 782       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 783       if (stmt_info)
 784         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 785           = STMT_VINFO_RELATED_STMT (stmt_info);
 786     }
 787   while (stmt_info);
 788 }
 789
 790 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 791
 792 static void
 793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 794 {
 795   stmt_vec_info first;
 796   unsigned i;
 797
 798   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 799     {
 800       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 801       while (next)
 802         {
 803           if ((STMT_VINFO_IN_PATTERN_P (next)
 804                != STMT_VINFO_IN_PATTERN_P (first))
 805               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 806             break;
 807           next = REDUC_GROUP_NEXT_ELEMENT (next);
 808         }
 809       /* If all reduction chain members are well-formed patterns adjust
 810          the group to group the pattern stmts instead.  */
 811       if (! next
 812           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 813         {
 814           if (STMT_VINFO_IN_PATTERN_P (first))
 815             {
 816               vect_fixup_reduc_chain (first);
 817               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 818                 = STMT_VINFO_RELATED_STMT (first);
 819             }
 820         }
 821       /* If not all stmt in the chain are patterns or if we failed
 822          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 823          it as regular reduction instead.  */
 824       else
 825         {
 826           stmt_vec_info vinfo = first;
 827           stmt_vec_info last = NULL;
 828           while (vinfo)
 829             {
 830               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 831               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 832               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 833               last = vinfo;
 834               vinfo = next;
 835             }
 836           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 837             = vect_internal_def;
 838           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 839           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 840           --i;
 841         }
 842     }
 843 }
 844
 845 /* Function vect_get_loop_niters.
 846
 847    Determine how many iterations the loop is executed and place it
 848    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 849    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 850    niter information holds in ASSUMPTIONS.
 851
 852    Return the loop exit condition.  */
 853
 854
 855 static gcond *
 856 vect_get_loop_niters (class loop *loop, tree *assumptions,
 857                       tree *number_of_iterations, tree *number_of_iterationsm1)
 858 {
 859   edge exit = single_exit (loop);
 860   class tree_niter_desc niter_desc;
 861   tree niter_assumptions, niter, may_be_zero;
 862   gcond *cond = get_loop_exit_condition (loop);
 863
 864   *assumptions = boolean_true_node;
 865   *number_of_iterationsm1 = chrec_dont_know;
 866   *number_of_iterations = chrec_dont_know;
 867   DUMP_VECT_SCOPE ("get_loop_niters");
 868
 869   if (!exit)
 870     return cond;
 871
 872   may_be_zero = NULL_TREE;
 873   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 874       || chrec_contains_undetermined (niter_desc.niter))
 875     return cond;
 876
 877   niter_assumptions = niter_desc.assumptions;
 878   may_be_zero = niter_desc.may_be_zero;
 879   niter = niter_desc.niter;
 880
 881   if (may_be_zero && integer_zerop (may_be_zero))
 882     may_be_zero = NULL_TREE;
 883
 884   if (may_be_zero)
 885     {
 886       if (COMPARISON_CLASS_P (may_be_zero))
 887         {
 888           /* Try to combine may_be_zero with assumptions, this can simplify
 889              computation of niter expression.  */
 890           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 891             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 892                                              niter_assumptions,
 893                                              fold_build1 (TRUTH_NOT_EXPR,
 894                                                           boolean_type_node,
 895                                                           may_be_zero));
 896           else
 897             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 898                                  build_int_cst (TREE_TYPE (niter), 0),
 899                                  rewrite_to_non_trapping_overflow (niter));
 900
 901           may_be_zero = NULL_TREE;
 902         }
 903       else if (integer_nonzerop (may_be_zero))
 904         {
 905           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 906           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 907           return cond;
 908         }
 909       else
 910         return cond;
 911     }
 912
 913   *assumptions = niter_assumptions;
 914   *number_of_iterationsm1 = niter;
 915
 916   /* We want the number of loop header executions which is the number
 917      of latch executions plus one.
 918      ???  For UINT_MAX latch executions this number overflows to zero
 919      for loops like do { n++; } while (n != 0);  */
 920   if (niter && !chrec_contains_undetermined (niter))
 921     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 922                           build_int_cst (TREE_TYPE (niter), 1));
 923   *number_of_iterations = niter;
 924
 925   return cond;
 926 }
 927
 928 /* Function bb_in_loop_p
 929
 930    Used as predicate for dfs order traversal of the loop bbs.  */
 931
 932 static bool
 933 bb_in_loop_p (const_basic_block bb, const void *data)
 934 {
 935   const class loop *const loop = (const class loop *)data;
 936   if (flow_bb_inside_loop_p (loop, bb))
 937     return true;
 938   return false;
 939 }
 940
 941
 942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 943    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 944
 945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
 946   : vec_info (vec_info::loop, shared),
 947     loop (loop_in),
 948     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 949     num_itersm1 (NULL_TREE),
 950     num_iters (NULL_TREE),
 951     num_iters_unchanged (NULL_TREE),
 952     num_iters_assumptions (NULL_TREE),
 953     vector_costs (nullptr),
 954     scalar_costs (nullptr),
 955     th (0),
 956     versioning_threshold (0),
 957     vectorization_factor (0),
 958     main_loop_edge (nullptr),
 959     skip_main_loop_edge (nullptr),
 960     skip_this_loop_edge (nullptr),
 961     reusable_accumulators (),
 962     suggested_unroll_factor (1),
 963     max_vectorization_factor (0),
 964     mask_skip_niters (NULL_TREE),
 965     rgroup_compare_type (NULL_TREE),
 966     simd_if_cond (NULL_TREE),
 967     partial_vector_style (vect_partial_vectors_none),
 968     unaligned_dr (NULL),
 969     peeling_for_alignment (0),
 970     ptr_mask (0),
 971     ivexpr_map (NULL),
 972     scan_map (NULL),
 973     slp_unrolling_factor (1),
 974     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
 975     vectorizable (false),
 976     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
 977     using_partial_vectors_p (false),
 978     using_decrementing_iv_p (false),
 979     using_select_vl_p (false),
 980     epil_using_partial_vectors_p (false),
 981     partial_load_store_bias (0),
 982     peeling_for_gaps (false),
 983     peeling_for_niter (false),
 984     no_data_dependencies (false),
 985     has_mask_store (false),
 986     scalar_loop_scaling (profile_probability::uninitialized ()),
 987     scalar_loop (NULL),
 988     orig_loop_info (NULL)
 989 {
 990   /* CHECKME: We want to visit all BBs before their successors (except for
 991      latch blocks, for which this assertion wouldn't hold).  In the simple
 992      case of the loop forms we allow, a dfs order of the BBs would the same
 993      as reversed postorder traversal, so we are safe.  */
 994
 995   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 996                                           bbs, loop->num_nodes, loop);
 997   gcc_assert (nbbs == loop->num_nodes);
 998
 999   for (unsigned int i = 0; i < nbbs; i++)
1000     {
1001       basic_block bb = bbs[i];
1002       gimple_stmt_iterator si;
1003
1004       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1005         {
1006           gimple *phi = gsi_stmt (si);
1007           gimple_set_uid (phi, 0);
1008           add_stmt (phi);
1009         }
1010
1011       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1012         {
1013           gimple *stmt = gsi_stmt (si);
1014           gimple_set_uid (stmt, 0);
1015           if (is_gimple_debug (stmt))
1016             continue;
1017           add_stmt (stmt);
1018           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1019              third argument is the #pragma omp simd if (x) condition, when 0,
1020              loop shouldn't be vectorized, when non-zero constant, it should
1021              be vectorized normally, otherwise versioned with vectorized loop
1022              done if the condition is non-zero at runtime.  */
1023           if (loop_in->simduid
1024               && is_gimple_call (stmt)
1025               && gimple_call_internal_p (stmt)
1026               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027               && gimple_call_num_args (stmt) >= 3
1028               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029               && (loop_in->simduid
1030                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1031             {
1032               tree arg = gimple_call_arg (stmt, 2);
1033               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034                 simd_if_cond = arg;
1035               else
1036                 gcc_assert (integer_nonzerop (arg));
1037             }
1038         }
1039     }
1040
1041   epilogue_vinfos.create (6);
1042 }
1043
1044 /* Free all levels of rgroup CONTROLS.  */
1045
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1048 {
1049   rgroup_controls *rgc;
1050   unsigned int i;
1051   FOR_EACH_VEC_ELT (*controls, i, rgc)
1052     rgc->controls.release ();
1053   controls->release ();
1054 }
1055
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057    stmt_vec_info structs of all the stmts in the loop.  */
1058
1059 _loop_vec_info::~_loop_vec_info ()
1060 {
1061   free (bbs);
1062
1063   release_vec_loop_controls (&masks.rgc_vec);
1064   release_vec_loop_controls (&lens);
1065   delete ivexpr_map;
1066   delete scan_map;
1067   epilogue_vinfos.release ();
1068   delete scalar_costs;
1069   delete vector_costs;
1070
1071   /* When we release an epiloge vinfo that we do not intend to use
1072      avoid clearing AUX of the main loop which should continue to
1073      point to the main loop vinfo since otherwise we'll leak that.  */
1074   if (loop->aux == this)
1075     loop->aux = NULL;
1076 }
1077
1078 /* Return an invariant or register for EXPR and emit necessary
1079    computations in the LOOP_VINFO loop preheader.  */
1080
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1083 {
1084   if (is_gimple_reg (expr)
1085       || is_gimple_min_invariant (expr))
1086     return expr;
1087
1088   if (! loop_vinfo->ivexpr_map)
1089     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091   if (! cached)
1092     {
1093       gimple_seq stmts = NULL;
1094       cached = force_gimple_operand (unshare_expr (expr),
1095                                      &stmts, true, NULL_TREE);
1096       if (stmts)
1097         {
1098           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099           gsi_insert_seq_on_edge_immediate (e, stmts);
1100         }
1101     }
1102   return cached;
1103 }
1104
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106    all masks required to mask LOOP_VINFO.  */
1107
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1110 {
1111   rgroup_controls *rgm;
1112   unsigned int i;
1113   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114     if (rgm->type != NULL_TREE
1115         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116                                             cmp_type, rgm->type,
1117                                             OPTIMIZE_FOR_SPEED))
1118       return false;
1119   return true;
1120 }
1121
1122 /* Calculate the maximum number of scalars per iteration for every
1123    rgroup in LOOP_VINFO.  */
1124
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1127 {
1128   unsigned int res = 1;
1129   unsigned int i;
1130   rgroup_controls *rgm;
1131   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132     res = MAX (res, rgm->max_nscalars_per_iter);
1133   return res;
1134 }
1135
1136 /* Calculate the minimum precision necessary to represent:
1137
1138       MAX_NITERS * FACTOR
1139
1140    as an unsigned integer, where MAX_NITERS is the maximum number of
1141    loop header iterations for the original scalar form of LOOP_VINFO.  */
1142
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1145 {
1146   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147
1148   /* Get the maximum number of iterations that is representable
1149      in the counter type.  */
1150   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1152
1153   /* Get a more refined estimate for the number of iterations.  */
1154   widest_int max_back_edges;
1155   if (max_loop_iterations (loop, &max_back_edges))
1156     max_ni = wi::smin (max_ni, max_back_edges + 1);
1157
1158   /* Work out how many bits we need to represent the limit.  */
1159   return wi::min_precision (max_ni * factor, UNSIGNED);
1160 }
1161
1162 /* True if the loop needs peeling or partial vectors when vectorized.  */
1163
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1166 {
1167   unsigned HOST_WIDE_INT const_vf;
1168   HOST_WIDE_INT max_niter
1169     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1170
1171   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174                                           (loop_vinfo));
1175
1176   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1178     {
1179       /* Work out the (constant) number of iterations that need to be
1180          peeled for reasons other than niters.  */
1181       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183         peel_niter += 1;
1184       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186         return true;
1187     }
1188   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189       /* ??? When peeling for gaps but not alignment, we could
1190          try to check whether the (variable) niters is known to be
1191          VF * N + 1.  That's something of a niche case though.  */
1192       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195            < (unsigned) exact_log2 (const_vf))
1196           /* In case of versioning, check if the maximum number of
1197              iterations is greater than th.  If they are identical,
1198              the epilogue is unnecessary.  */
1199           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200               || ((unsigned HOST_WIDE_INT) max_niter
1201                   > (th / const_vf) * const_vf))))
1202     return true;
1203
1204   return false;
1205 }
1206
1207 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1208    whether we can actually generate the masks required.  Return true if so,
1209    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1210
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1213 {
1214   unsigned int min_ni_width;
1215
1216   /* Use a normal loop if there are no statements that need masking.
1217      This only happens in rare degenerate cases: it means that the loop
1218      has no loads, no stores, and no live-out values.  */
1219   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220     return false;
1221
1222   /* Produce the rgroup controls.  */
1223   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1224     {
1225       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226       tree vectype = mask.first;
1227       unsigned nvectors = mask.second;
1228
1229       if (masks->rgc_vec.length () < nvectors)
1230         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232       /* The number of scalars per iteration and the number of vectors are
1233          both compile-time constants.  */
1234       unsigned int nscalars_per_iter
1235           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1237
1238       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1239         {
1240           rgm->max_nscalars_per_iter = nscalars_per_iter;
1241           rgm->type = truth_type_for (vectype);
1242           rgm->factor = 1;
1243         }
1244     }
1245
1246   unsigned int max_nscalars_per_iter
1247     = vect_get_max_nscalars_per_iter (loop_vinfo);
1248
1249   /* Work out how many bits we need to represent the limit.  */
1250   min_ni_width
1251     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1252
1253   /* Find a scalar mode for which WHILE_ULT is supported.  */
1254   opt_scalar_int_mode cmp_mode_iter;
1255   tree cmp_type = NULL_TREE;
1256   tree iv_type = NULL_TREE;
1257   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258   unsigned int iv_precision = UINT_MAX;
1259
1260   if (iv_limit != -1)
1261     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262                                       UNSIGNED);
1263
1264   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1265     {
1266       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267       if (cmp_bits >= min_ni_width
1268           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1269         {
1270           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271           if (this_type
1272               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1273             {
1274               /* Although we could stop as soon as we find a valid mode,
1275                  there are at least two reasons why that's not always the
1276                  best choice:
1277
1278                  - An IV that's Pmode or wider is more likely to be reusable
1279                    in address calculations than an IV that's narrower than
1280                    Pmode.
1281
1282                  - Doing the comparison in IV_PRECISION or wider allows
1283                    a natural 0-based IV, whereas using a narrower comparison
1284                    type requires mitigations against wrap-around.
1285
1286                  Conversely, if the IV limit is variable, doing the comparison
1287                  in a wider type than the original type can introduce
1288                  unnecessary extensions, so picking the widest valid mode
1289                  is not always a good choice either.
1290
1291                  Here we prefer the first IV type that's Pmode or wider,
1292                  and the first comparison type that's IV_PRECISION or wider.
1293                  (The comparison type must be no wider than the IV type,
1294                  to avoid extensions in the vector loop.)
1295
1296                  ??? We might want to try continuing beyond Pmode for ILP32
1297                  targets if CMP_BITS < IV_PRECISION.  */
1298               iv_type = this_type;
1299               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300                 cmp_type = this_type;
1301               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302                 break;
1303             }
1304         }
1305     }
1306
1307   if (!cmp_type)
1308     {
1309       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310       return false;
1311     }
1312
1313   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316   return true;
1317 }
1318
1319 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1320    whether we can actually generate AVX512 style masks.  Return true if so,
1321    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1322
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1325 {
1326   /* Produce differently organized rgc_vec and differently check
1327      we can produce masks.  */
1328
1329   /* Use a normal loop if there are no statements that need masking.
1330      This only happens in rare degenerate cases: it means that the loop
1331      has no loads, no stores, and no live-out values.  */
1332   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333     return false;
1334
1335   /* For the decrementing IV we need to represent all values in
1336      [0, niter + niter_skip] where niter_skip is the elements we
1337      skip in the first iteration for prologue peeling.  */
1338   tree iv_type = NULL_TREE;
1339   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340   unsigned int iv_precision = UINT_MAX;
1341   if (iv_limit != -1)
1342     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1343
1344   /* First compute the type for the IV we use to track the remaining
1345      scalar iterations.  */
1346   opt_scalar_int_mode cmp_mode_iter;
1347   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1348     {
1349       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350       if (cmp_bits >= iv_precision
1351           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1352         {
1353           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354           if (iv_type)
1355             break;
1356         }
1357     }
1358   if (!iv_type)
1359     return false;
1360
1361   /* Produce the rgroup controls.  */
1362   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1363     {
1364       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365       tree vectype = mask.first;
1366       unsigned nvectors = mask.second;
1367
1368       /* The number of scalars per iteration and the number of vectors are
1369          both compile-time constants.  */
1370       unsigned int nscalars_per_iter
1371         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1373
1374       /* We index the rgroup_controls vector with nscalars_per_iter
1375          which we keep constant and instead have a varying nvectors,
1376          remembering the vector mask with the fewest nV.  */
1377       if (masks->rgc_vec.length () < nscalars_per_iter)
1378         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1380
1381       if (!rgm->type || rgm->factor > nvectors)
1382         {
1383           rgm->type = truth_type_for (vectype);
1384           rgm->compare_type = NULL_TREE;
1385           rgm->max_nscalars_per_iter = nscalars_per_iter;
1386           rgm->factor = nvectors;
1387           rgm->bias_adjusted_ctrl = NULL_TREE;
1388         }
1389     }
1390
1391   /* There is no fixed compare type we are going to use but we have to
1392      be able to get at one for each mask group.  */
1393   unsigned int min_ni_width
1394     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1395
1396   bool ok = true;
1397   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1398     {
1399       tree mask_type = rgc.type;
1400       if (!mask_type)
1401         continue;
1402
1403       if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1404         {
1405           ok = false;
1406           break;
1407         }
1408
1409       /* If iv_type is usable as compare type use that - we can elide the
1410          saturation in that case.   */
1411       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1412         {
1413           tree cmp_vectype
1414             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416             rgc.compare_type = cmp_vectype;
1417         }
1418       if (!rgc.compare_type)
1419         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1420           {
1421             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422             if (cmp_bits >= min_ni_width
1423                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1424               {
1425                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426                 if (!cmp_type)
1427                   continue;
1428
1429                 /* Check whether we can produce the mask with cmp_type.  */
1430                 tree cmp_vectype
1431                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1433                   {
1434                     rgc.compare_type = cmp_vectype;
1435                     break;
1436                   }
1437               }
1438         }
1439       if (!rgc.compare_type)
1440         {
1441           ok = false;
1442           break;
1443         }
1444     }
1445   if (!ok)
1446     {
1447       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448       return false;
1449     }
1450
1451   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454   return true;
1455 }
1456
1457 /* Check whether we can use vector access with length based on precison
1458    comparison.  So far, to keep it simple, we only allow the case that the
1459    precision of the target supported length is larger than the precision
1460    required by loop niters.  */
1461
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1464 {
1465   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466     return false;
1467
1468   machine_mode len_load_mode = get_len_load_store_mode
1469     (loop_vinfo->vector_mode, true).require ();
1470   machine_mode len_store_mode = get_len_load_store_mode
1471     (loop_vinfo->vector_mode, false).require ();
1472
1473   signed char partial_load_bias = internal_len_load_store_bias
1474     (IFN_LEN_LOAD, len_load_mode);
1475
1476   signed char partial_store_bias = internal_len_load_store_bias
1477     (IFN_LEN_STORE, len_store_mode);
1478
1479   gcc_assert (partial_load_bias == partial_store_bias);
1480
1481   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482     return false;
1483
1484   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485      len_loads with a length of zero.  In order to avoid that we prohibit
1486      more than one loop length here.  */
1487   if (partial_load_bias == -1
1488       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489     return false;
1490
1491   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1492
1493   unsigned int max_nitems_per_iter = 1;
1494   unsigned int i;
1495   rgroup_controls *rgl;
1496   /* Find the maximum number of items per iteration for every rgroup.  */
1497   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1498     {
1499       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1501     }
1502
1503   /* Work out how many bits we need to represent the length limit.  */
1504   unsigned int min_ni_prec
1505     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1506
1507   /* Now use the maximum of below precisions for one suitable IV type:
1508      - the IV's natural precision
1509      - the precision needed to hold: the maximum number of scalar
1510        iterations multiplied by the scale factor (min_ni_prec above)
1511      - the Pmode precision
1512
1513      If min_ni_prec is less than the precision of the current niters,
1514      we perfer to still use the niters type.  Prefer to use Pmode and
1515      wider IV to avoid narrow conversions.  */
1516
1517   unsigned int ni_prec
1518     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519   min_ni_prec = MAX (min_ni_prec, ni_prec);
1520   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1521
1522   tree iv_type = NULL_TREE;
1523   opt_scalar_int_mode tmode_iter;
1524   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1525     {
1526       scalar_mode tmode = tmode_iter.require ();
1527       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1528
1529       /* ??? Do we really want to construct one IV whose precision exceeds
1530          BITS_PER_WORD?  */
1531       if (tbits > BITS_PER_WORD)
1532         break;
1533
1534       /* Find the first available standard integral type.  */
1535       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1536         {
1537           iv_type = build_nonstandard_integer_type (tbits, true);
1538           break;
1539         }
1540     }
1541
1542   if (!iv_type)
1543     {
1544       if (dump_enabled_p ())
1545         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546                          "can't vectorize with length-based partial vectors"
1547                          " because there is no suitable iv type.\n");
1548       return false;
1549     }
1550
1551   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1554
1555   return true;
1556 }
1557
1558 /* Calculate the cost of one scalar iteration of the loop.  */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1561 {
1562   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564   int nbbs = loop->num_nodes, factor;
1565   int innerloop_iters, i;
1566
1567   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1568
1569   /* Gather costs for statements in the scalar loop.  */
1570
1571   /* FORNOW.  */
1572   innerloop_iters = 1;
1573   if (loop->inner)
1574     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1575
1576   for (i = 0; i < nbbs; i++)
1577     {
1578       gimple_stmt_iterator si;
1579       basic_block bb = bbs[i];
1580
1581       if (bb->loop_father == loop->inner)
1582         factor = innerloop_iters;
1583       else
1584         factor = 1;
1585
1586       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1587         {
1588           gimple *stmt = gsi_stmt (si);
1589           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1590
1591           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592             continue;
1593
1594           /* Skip stmts that are not vectorized inside the loop.  */
1595           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597               && (!STMT_VINFO_LIVE_P (vstmt_info)
1598                   || !VECTORIZABLE_CYCLE_DEF
1599                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600             continue;
1601
1602           vect_cost_for_stmt kind;
1603           if (STMT_VINFO_DATA_REF (stmt_info))
1604             {
1605               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606                kind = scalar_load;
1607              else
1608                kind = scalar_store;
1609             }
1610           else if (vect_nop_conversion_p (stmt_info))
1611             continue;
1612           else
1613             kind = scalar_stmt;
1614
1615           /* We are using vect_prologue here to avoid scaling twice
1616              by the inner loop factor.  */
1617           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618                             factor, kind, stmt_info, 0, vect_prologue);
1619         }
1620     }
1621
1622   /* Now accumulate cost.  */
1623   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624   add_stmt_costs (loop_vinfo->scalar_costs,
1625                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626   loop_vinfo->scalar_costs->finish_cost (nullptr);
1627 }
1628
1629
1630 /* Function vect_analyze_loop_form.
1631
1632    Verify that certain CFG restrictions hold, including:
1633    - the loop has a pre-header
1634    - the loop has a single entry and exit
1635    - the loop exit condition is simple enough
1636    - the number of iterations can be analyzed, i.e, a countable loop.  The
1637      niter could be analyzed under some assumptions.  */
1638
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1641 {
1642   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1643
1644   /* Different restrictions apply when we are considering an inner-most loop,
1645      vs. an outer (nested) loop.
1646      (FORNOW. May want to relax some of these restrictions in the future).  */
1647
1648   info->inner_loop_cond = NULL;
1649   if (!loop->inner)
1650     {
1651       /* Inner-most loop.  We currently require that the number of BBs is
1652          exactly 2 (the header and latch).  Vectorizable inner-most loops
1653          look like this:
1654
1655                         (pre-header)
1656                            |
1657                           header <--------+
1658                            | |            |
1659                            | +--> latch --+
1660                            |
1661                         (exit-bb)  */
1662
1663       if (loop->num_nodes != 2)
1664         return opt_result::failure_at (vect_location,
1665                                        "not vectorized:"
1666                                        " control flow in loop.\n");
1667
1668       if (empty_block_p (loop->header))
1669         return opt_result::failure_at (vect_location,
1670                                        "not vectorized: empty loop.\n");
1671     }
1672   else
1673     {
1674       class loop *innerloop = loop->inner;
1675       edge entryedge;
1676
1677       /* Nested loop. We currently require that the loop is doubly-nested,
1678          contains a single inner loop, and the number of BBs is exactly 5.
1679          Vectorizable outer-loops look like this:
1680
1681                         (pre-header)
1682                            |
1683                           header <---+
1684                            |         |
1685                           inner-loop |
1686                            |         |
1687                           tail ------+
1688                            |
1689                         (exit-bb)
1690
1691          The inner-loop has the properties expected of inner-most loops
1692          as described above.  */
1693
1694       if ((loop->inner)->inner || (loop->inner)->next)
1695         return opt_result::failure_at (vect_location,
1696                                        "not vectorized:"
1697                                        " multiple nested loops.\n");
1698
1699       if (loop->num_nodes != 5)
1700         return opt_result::failure_at (vect_location,
1701                                        "not vectorized:"
1702                                        " control flow in loop.\n");
1703
1704       entryedge = loop_preheader_edge (innerloop);
1705       if (entryedge->src != loop->header
1706           || !single_exit (innerloop)
1707           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708         return opt_result::failure_at (vect_location,
1709                                        "not vectorized:"
1710                                        " unsupported outerloop form.\n");
1711
1712       /* Analyze the inner-loop.  */
1713       vect_loop_form_info inner;
1714       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715       if (!res)
1716         {
1717           if (dump_enabled_p ())
1718             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719                              "not vectorized: Bad inner loop.\n");
1720           return res;
1721         }
1722
1723       /* Don't support analyzing niter under assumptions for inner
1724          loop.  */
1725       if (!integer_onep (inner.assumptions))
1726         return opt_result::failure_at (vect_location,
1727                                        "not vectorized: Bad inner loop.\n");
1728
1729       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730         return opt_result::failure_at (vect_location,
1731                                        "not vectorized: inner-loop count not"
1732                                        " invariant.\n");
1733
1734       if (dump_enabled_p ())
1735         dump_printf_loc (MSG_NOTE, vect_location,
1736                          "Considering outer-loop vectorization.\n");
1737       info->inner_loop_cond = inner.loop_cond;
1738     }
1739
1740   if (!single_exit (loop))
1741     return opt_result::failure_at (vect_location,
1742                                    "not vectorized: multiple exits.\n");
1743   if (EDGE_COUNT (loop->header->preds) != 2)
1744     return opt_result::failure_at (vect_location,
1745                                    "not vectorized:"
1746                                    " too many incoming edges.\n");
1747
1748   /* We assume that the loop exit condition is at the end of the loop. i.e,
1749      that the loop is represented as a do-while (with a proper if-guard
1750      before the loop if needed), where the loop header contains all the
1751      executable statements, and the latch is empty.  */
1752   if (!empty_block_p (loop->latch)
1753       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754     return opt_result::failure_at (vect_location,
1755                                    "not vectorized: latch block not empty.\n");
1756
1757   /* Make sure the exit is not abnormal.  */
1758   edge e = single_exit (loop);
1759   if (e->flags & EDGE_ABNORMAL)
1760     return opt_result::failure_at (vect_location,
1761                                    "not vectorized:"
1762                                    " abnormal loop exit edge.\n");
1763
1764   info->loop_cond
1765     = vect_get_loop_niters (loop, &info->assumptions,
1766                             &info->number_of_iterations,
1767                             &info->number_of_iterationsm1);
1768   if (!info->loop_cond)
1769     return opt_result::failure_at
1770       (vect_location,
1771        "not vectorized: complicated exit condition.\n");
1772
1773   if (integer_zerop (info->assumptions)
1774       || !info->number_of_iterations
1775       || chrec_contains_undetermined (info->number_of_iterations))
1776     return opt_result::failure_at
1777       (info->loop_cond,
1778        "not vectorized: number of iterations cannot be computed.\n");
1779
1780   if (integer_zerop (info->number_of_iterations))
1781     return opt_result::failure_at
1782       (info->loop_cond,
1783        "not vectorized: number of iterations = 0.\n");
1784
1785   if (!(tree_fits_shwi_p (info->number_of_iterations)
1786         && tree_to_shwi (info->number_of_iterations) > 0))
1787     {
1788       if (dump_enabled_p ())
1789         {
1790           dump_printf_loc (MSG_NOTE, vect_location,
1791                            "Symbolic number of iterations is ");
1792           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793           dump_printf (MSG_NOTE, "\n");
1794         }
1795     }
1796
1797   return opt_result::success ();
1798 }
1799
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801    vect_analyze_loop_form result.  */
1802
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805                         const vect_loop_form_info *info,
1806                         loop_vec_info main_loop_info)
1807 {
1808   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813   /* Also record the assumptions for versioning.  */
1814   if (!integer_onep (info->assumptions) && !main_loop_info)
1815     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1816
1817   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819   if (info->inner_loop_cond)
1820     {
1821       stmt_vec_info inner_loop_cond_info
1822         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824       /* If we have an estimate on the number of iterations of the inner
1825          loop use that to limit the scale for costing, otherwise use
1826          --param vect-inner-loop-cost-factor literally.  */
1827       widest_int nit;
1828       if (estimated_stmt_executions (loop->inner, &nit))
1829         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1831     }
1832
1833   return loop_vinfo;
1834 }
1835
1836
1837
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839    statements update the vectorization factor.  */
1840
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1843 {
1844   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846   int nbbs = loop->num_nodes;
1847   poly_uint64 vectorization_factor;
1848   int i;
1849
1850   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1851
1852   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853   gcc_assert (known_ne (vectorization_factor, 0U));
1854
1855   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1856      vectorization factor of the loop is the unrolling factor required by
1857      the SLP instances.  If that unrolling factor is 1, we say, that we
1858      perform pure SLP on loop - cross iteration parallelism is not
1859      exploited.  */
1860   bool only_slp_in_loop = true;
1861   for (i = 0; i < nbbs; i++)
1862     {
1863       basic_block bb = bbs[i];
1864       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865            gsi_next (&si))
1866         {
1867           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868           if (!stmt_info)
1869             continue;
1870           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872               && !PURE_SLP_STMT (stmt_info))
1873             /* STMT needs both SLP and loop-based vectorization.  */
1874             only_slp_in_loop = false;
1875         }
1876       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877            gsi_next (&si))
1878         {
1879           if (is_gimple_debug (gsi_stmt (si)))
1880             continue;
1881           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882           stmt_info = vect_stmt_to_vectorize (stmt_info);
1883           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885               && !PURE_SLP_STMT (stmt_info))
1886             /* STMT needs both SLP and loop-based vectorization.  */
1887             only_slp_in_loop = false;
1888         }
1889     }
1890
1891   if (only_slp_in_loop)
1892     {
1893       if (dump_enabled_p ())
1894         dump_printf_loc (MSG_NOTE, vect_location,
1895                          "Loop contains only SLP stmts\n");
1896       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1897     }
1898   else
1899     {
1900       if (dump_enabled_p ())
1901         dump_printf_loc (MSG_NOTE, vect_location,
1902                          "Loop contains SLP and non-SLP stmts\n");
1903       /* Both the vectorization factor and unroll factor have the form
1904          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905          so they must have a common multiple.  */
1906       vectorization_factor
1907         = force_common_multiple (vectorization_factor,
1908                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1909     }
1910
1911   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912   if (dump_enabled_p ())
1913     {
1914       dump_printf_loc (MSG_NOTE, vect_location,
1915                        "Updating vectorization factor to ");
1916       dump_dec (MSG_NOTE, vectorization_factor);
1917       dump_printf (MSG_NOTE, ".\n");
1918     }
1919 }
1920
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922    the other phi in the reduction is also relevant for vectorization.
1923    This rejects cases such as:
1924
1925       outer1:
1926         x_1 = PHI <x_3(outer2), ...>;
1927         ...
1928
1929       inner:
1930         x_2 = ...;
1931         ...
1932
1933       outer2:
1934         x_3 = PHI <x_2(inner)>;
1935
1936    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1937
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1940 {
1941   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942     return false;
1943
1944   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1945 }
1946
1947 /* Function vect_analyze_loop_operations.
1948
1949    Scan the loop stmts and make sure they are all vectorizable.  */
1950
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1953 {
1954   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956   int nbbs = loop->num_nodes;
1957   int i;
1958   stmt_vec_info stmt_info;
1959   bool need_to_vectorize = false;
1960   bool ok;
1961
1962   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1963
1964   auto_vec<stmt_info_for_cost> cost_vec;
1965
1966   for (i = 0; i < nbbs; i++)
1967     {
1968       basic_block bb = bbs[i];
1969
1970       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971            gsi_next (&si))
1972         {
1973           gphi *phi = si.phi ();
1974           ok = true;
1975
1976           stmt_info = loop_vinfo->lookup_stmt (phi);
1977           if (dump_enabled_p ())
1978             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979                              (gimple *) phi);
1980           if (virtual_operand_p (gimple_phi_result (phi)))
1981             continue;
1982
1983           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984              (i.e., a phi in the tail of the outer-loop).  */
1985           if (! is_loop_header_bb_p (bb))
1986             {
1987               /* FORNOW: we currently don't support the case that these phis
1988                  are not used in the outerloop (unless it is double reduction,
1989                  i.e., this phi is vect_reduction_def), cause this case
1990                  requires to actually do something here.  */
1991               if (STMT_VINFO_LIVE_P (stmt_info)
1992                   && !vect_active_double_reduction_p (stmt_info))
1993                 return opt_result::failure_at (phi,
1994                                                "Unsupported loop-closed phi"
1995                                                " in outer-loop.\n");
1996
1997               /* If PHI is used in the outer loop, we check that its operand
1998                  is defined in the inner loop.  */
1999               if (STMT_VINFO_RELEVANT_P (stmt_info))
2000                 {
2001                   tree phi_op;
2002
2003                   if (gimple_phi_num_args (phi) != 1)
2004                     return opt_result::failure_at (phi, "unsupported phi");
2005
2006                   phi_op = PHI_ARG_DEF (phi, 0);
2007                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008                   if (!op_def_info)
2009                     return opt_result::failure_at (phi, "unsupported phi\n");
2010
2011                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012                       && (STMT_VINFO_RELEVANT (op_def_info)
2013                           != vect_used_in_outer_by_reduction))
2014                     return opt_result::failure_at (phi, "unsupported phi\n");
2015
2016                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2018                            == vect_double_reduction_def))
2019                       && !vectorizable_lc_phi (loop_vinfo,
2020                                                stmt_info, NULL, NULL))
2021                     return opt_result::failure_at (phi, "unsupported phi\n");
2022                 }
2023
2024               continue;
2025             }
2026
2027           gcc_assert (stmt_info);
2028
2029           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030                || STMT_VINFO_LIVE_P (stmt_info))
2031               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033             /* A scalar-dependence cycle that we don't support.  */
2034             return opt_result::failure_at (phi,
2035                                            "not vectorized:"
2036                                            " scalar dependence cycle.\n");
2037
2038           if (STMT_VINFO_RELEVANT_P (stmt_info))
2039             {
2040               need_to_vectorize = true;
2041               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042                   && ! PURE_SLP_STMT (stmt_info))
2043                 ok = vectorizable_induction (loop_vinfo,
2044                                              stmt_info, NULL, NULL,
2045                                              &cost_vec);
2046               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2048                             == vect_double_reduction_def)
2049                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050                        && ! PURE_SLP_STMT (stmt_info))
2051                 ok = vectorizable_reduction (loop_vinfo,
2052                                              stmt_info, NULL, NULL, &cost_vec);
2053               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054                         == vect_first_order_recurrence)
2055                        && ! PURE_SLP_STMT (stmt_info))
2056                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057                                            &cost_vec);
2058             }
2059
2060           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2061           if (ok
2062               && STMT_VINFO_LIVE_P (stmt_info)
2063               && !PURE_SLP_STMT (stmt_info))
2064             ok = vectorizable_live_operation (loop_vinfo,
2065                                               stmt_info, NULL, NULL, NULL,
2066                                               -1, false, &cost_vec);
2067
2068           if (!ok)
2069             return opt_result::failure_at (phi,
2070                                            "not vectorized: relevant phi not "
2071                                            "supported: %G",
2072                                            static_cast <gimple *> (phi));
2073         }
2074
2075       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2076            gsi_next (&si))
2077         {
2078           gimple *stmt = gsi_stmt (si);
2079           if (!gimple_clobber_p (stmt)
2080               && !is_gimple_debug (stmt))
2081             {
2082               opt_result res
2083                 = vect_analyze_stmt (loop_vinfo,
2084                                      loop_vinfo->lookup_stmt (stmt),
2085                                      &need_to_vectorize,
2086                                      NULL, NULL, &cost_vec);
2087               if (!res)
2088                 return res;
2089             }
2090         }
2091     } /* bbs */
2092
2093   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2094
2095   /* All operations in the loop are either irrelevant (deal with loop
2096      control, or dead), or only used outside the loop and can be moved
2097      out of the loop (e.g. invariants, inductions).  The loop can be
2098      optimized away by scalar optimizations.  We're better off not
2099      touching this loop.  */
2100   if (!need_to_vectorize)
2101     {
2102       if (dump_enabled_p ())
2103         dump_printf_loc (MSG_NOTE, vect_location,
2104                          "All the computation can be taken out of the loop.\n");
2105       return opt_result::failure_at
2106         (vect_location,
2107          "not vectorized: redundant loop. no profit to vectorize.\n");
2108     }
2109
2110   return opt_result::success ();
2111 }
2112
2113 /* Return true if we know that the iteration count is smaller than the
2114    vectorization factor.  Return false if it isn't, or if we can't be sure
2115    either way.  */
2116
2117 static bool
2118 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2119 {
2120   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2121
2122   HOST_WIDE_INT max_niter;
2123   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2124     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2125   else
2126     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2127
2128   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2129     return true;
2130
2131   return false;
2132 }
2133
2134 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2135    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2136    definitely no, or -1 if it's worth retrying.  */
2137
2138 static int
2139 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2140                            unsigned *suggested_unroll_factor)
2141 {
2142   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2143   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2144
2145   /* Only loops that can handle partially-populated vectors can have iteration
2146      counts less than the vectorization factor.  */
2147   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2148     {
2149       if (vect_known_niters_smaller_than_vf (loop_vinfo))
2150         {
2151           if (dump_enabled_p ())
2152             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153                              "not vectorized: iteration count smaller than "
2154                              "vectorization factor.\n");
2155           return 0;
2156         }
2157     }
2158
2159   /* If using the "very cheap" model. reject cases in which we'd keep
2160      a copy of the scalar code (even if we might be able to vectorize it).  */
2161   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2162       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2163           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2164           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2165     {
2166       if (dump_enabled_p ())
2167         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168                          "some scalar iterations would need to be peeled\n");
2169       return 0;
2170     }
2171
2172   int min_profitable_iters, min_profitable_estimate;
2173   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2174                                       &min_profitable_estimate,
2175                                       suggested_unroll_factor);
2176
2177   if (min_profitable_iters < 0)
2178     {
2179       if (dump_enabled_p ())
2180         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2181                          "not vectorized: vectorization not profitable.\n");
2182       if (dump_enabled_p ())
2183         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184                          "not vectorized: vector version will never be "
2185                          "profitable.\n");
2186       return -1;
2187     }
2188
2189   int min_scalar_loop_bound = (param_min_vect_loop_bound
2190                                * assumed_vf);
2191
2192   /* Use the cost model only if it is more conservative than user specified
2193      threshold.  */
2194   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2195                                     min_profitable_iters);
2196
2197   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2198
2199   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2200       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2201     {
2202       if (dump_enabled_p ())
2203         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204                          "not vectorized: vectorization not profitable.\n");
2205       if (dump_enabled_p ())
2206         dump_printf_loc (MSG_NOTE, vect_location,
2207                          "not vectorized: iteration count smaller than user "
2208                          "specified loop bound parameter or minimum profitable "
2209                          "iterations (whichever is more conservative).\n");
2210       return 0;
2211     }
2212
2213   /* The static profitablity threshold min_profitable_estimate includes
2214      the cost of having to check at runtime whether the scalar loop
2215      should be used instead.  If it turns out that we don't need or want
2216      such a check, the threshold we should use for the static estimate
2217      is simply the point at which the vector loop becomes more profitable
2218      than the scalar loop.  */
2219   if (min_profitable_estimate > min_profitable_iters
2220       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2221       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2222       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2223       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2224     {
2225       if (dump_enabled_p ())
2226         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2227                          " choice between the scalar and vector loops\n");
2228       min_profitable_estimate = min_profitable_iters;
2229     }
2230
2231   /* If the vector loop needs multiple iterations to be beneficial then
2232      things are probably too close to call, and the conservative thing
2233      would be to stick with the scalar code.  */
2234   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2235       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2236     {
2237       if (dump_enabled_p ())
2238         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239                          "one iteration of the vector loop would be"
2240                          " more expensive than the equivalent number of"
2241                          " iterations of the scalar loop\n");
2242       return 0;
2243     }
2244
2245   HOST_WIDE_INT estimated_niter;
2246
2247   /* If we are vectorizing an epilogue then we know the maximum number of
2248      scalar iterations it will cover is at least one lower than the
2249      vectorization factor of the main loop.  */
2250   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2251     estimated_niter
2252       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2253   else
2254     {
2255       estimated_niter = estimated_stmt_executions_int (loop);
2256       if (estimated_niter == -1)
2257         estimated_niter = likely_max_stmt_executions_int (loop);
2258     }
2259   if (estimated_niter != -1
2260       && ((unsigned HOST_WIDE_INT) estimated_niter
2261           < MAX (th, (unsigned) min_profitable_estimate)))
2262     {
2263       if (dump_enabled_p ())
2264         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265                          "not vectorized: estimated iteration count too "
2266                          "small.\n");
2267       if (dump_enabled_p ())
2268         dump_printf_loc (MSG_NOTE, vect_location,
2269                          "not vectorized: estimated iteration count smaller "
2270                          "than specified loop bound parameter or minimum "
2271                          "profitable iterations (whichever is more "
2272                          "conservative).\n");
2273       return -1;
2274     }
2275
2276   return 1;
2277 }
2278
2279 static opt_result
2280 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2281                            vec<data_reference_p> *datarefs,
2282                            unsigned int *n_stmts)
2283 {
2284   *n_stmts = 0;
2285   for (unsigned i = 0; i < loop->num_nodes; i++)
2286     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2287          !gsi_end_p (gsi); gsi_next (&gsi))
2288       {
2289         gimple *stmt = gsi_stmt (gsi);
2290         if (is_gimple_debug (stmt))
2291           continue;
2292         ++(*n_stmts);
2293         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2294                                                         NULL, 0);
2295         if (!res)
2296           {
2297             if (is_gimple_call (stmt) && loop->safelen)
2298               {
2299                 tree fndecl = gimple_call_fndecl (stmt), op;
2300                 if (fndecl == NULL_TREE
2301                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2302                   {
2303                     fndecl = gimple_call_arg (stmt, 0);
2304                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2305                     fndecl = TREE_OPERAND (fndecl, 0);
2306                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2307                   }
2308                 if (fndecl != NULL_TREE)
2309                   {
2310                     cgraph_node *node = cgraph_node::get (fndecl);
2311                     if (node != NULL && node->simd_clones != NULL)
2312                       {
2313                         unsigned int j, n = gimple_call_num_args (stmt);
2314                         for (j = 0; j < n; j++)
2315                           {
2316                             op = gimple_call_arg (stmt, j);
2317                             if (DECL_P (op)
2318                                 || (REFERENCE_CLASS_P (op)
2319                                     && get_base_address (op)))
2320                               break;
2321                           }
2322                         op = gimple_call_lhs (stmt);
2323                         /* Ignore #pragma omp declare simd functions
2324                            if they don't have data references in the
2325                            call stmt itself.  */
2326                         if (j == n
2327                             && !(op
2328                                  && (DECL_P (op)
2329                                      || (REFERENCE_CLASS_P (op)
2330                                          && get_base_address (op)))))
2331                           continue;
2332                       }
2333                   }
2334               }
2335             return res;
2336           }
2337         /* If dependence analysis will give up due to the limit on the
2338            number of datarefs stop here and fail fatally.  */
2339         if (datarefs->length ()
2340             > (unsigned)param_loop_max_datarefs_for_datadeps)
2341           return opt_result::failure_at (stmt, "exceeded param "
2342                                          "loop-max-datarefs-for-datadeps\n");
2343       }
2344   return opt_result::success ();
2345 }
2346
2347 /* Look for SLP-only access groups and turn each individual access into its own
2348    group.  */
2349 static void
2350 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2351 {
2352   unsigned int i;
2353   struct data_reference *dr;
2354
2355   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2356
2357   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2358   FOR_EACH_VEC_ELT (datarefs, i, dr)
2359     {
2360       gcc_assert (DR_REF (dr));
2361       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2362
2363       /* Check if the load is a part of an interleaving chain.  */
2364       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2365         {
2366           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2367           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2368           unsigned int group_size = DR_GROUP_SIZE (first_element);
2369
2370           /* Check if SLP-only groups.  */
2371           if (!STMT_SLP_TYPE (stmt_info)
2372               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2373             {
2374               /* Dissolve the group.  */
2375               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2376
2377               stmt_vec_info vinfo = first_element;
2378               while (vinfo)
2379                 {
2380                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2381                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2382                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2383                   DR_GROUP_SIZE (vinfo) = 1;
2384                   if (STMT_VINFO_STRIDED_P (first_element))
2385                     DR_GROUP_GAP (vinfo) = 0;
2386                   else
2387                     DR_GROUP_GAP (vinfo) = group_size - 1;
2388                   /* Duplicate and adjust alignment info, it needs to
2389                      be present on each group leader, see dr_misalignment.  */
2390                   if (vinfo != first_element)
2391                     {
2392                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2393                       dr_info2->target_alignment = dr_info->target_alignment;
2394                       int misalignment = dr_info->misalignment;
2395                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2396                         {
2397                           HOST_WIDE_INT diff
2398                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2399                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2400                           unsigned HOST_WIDE_INT align_c
2401                             = dr_info->target_alignment.to_constant ();
2402                           misalignment = (misalignment + diff) % align_c;
2403                         }
2404                       dr_info2->misalignment = misalignment;
2405                     }
2406                   vinfo = next;
2407                 }
2408             }
2409         }
2410     }
2411 }
2412
2413 /* Determine if operating on full vectors for LOOP_VINFO might leave
2414    some scalar iterations still to do.  If so, decide how we should
2415    handle those scalar iterations.  The possibilities are:
2416
2417    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2418        In this case:
2419
2420          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2421          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2422          LOOP_VINFO_PEELING_FOR_NITER == false
2423
2424    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2425        to handle the remaining scalar iterations.  In this case:
2426
2427          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2428          LOOP_VINFO_PEELING_FOR_NITER == true
2429
2430        There are two choices:
2431
2432        (2a) Consider vectorizing the epilogue loop at the same VF as the
2433             main loop, but using partial vectors instead of full vectors.
2434             In this case:
2435
2436               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2437
2438        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2439             In this case:
2440
2441               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2442
2443    When FOR_EPILOGUE_P is true, make this determination based on the
2444    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2445    based on the assumption that LOOP_VINFO is the main loop.  The caller
2446    has made sure that the number of iterations is set appropriately for
2447    this value of FOR_EPILOGUE_P.  */
2448
2449 opt_result
2450 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2451                                             bool for_epilogue_p)
2452 {
2453   /* Determine whether there would be any scalar iterations left over.  */
2454   bool need_peeling_or_partial_vectors_p
2455     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2456
2457   /* Decide whether to vectorize the loop with partial vectors.  */
2458   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2459   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2460   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2461       && need_peeling_or_partial_vectors_p)
2462     {
2463       /* For partial-vector-usage=1, try to push the handling of partial
2464          vectors to the epilogue, with the main loop continuing to operate
2465          on full vectors.
2466
2467          If we are unrolling we also do not want to use partial vectors. This
2468          is to avoid the overhead of generating multiple masks and also to
2469          avoid having to execute entire iterations of FALSE masked instructions
2470          when dealing with one or less full iterations.
2471
2472          ??? We could then end up failing to use partial vectors if we
2473          decide to peel iterations into a prologue, and if the main loop
2474          then ends up processing fewer than VF iterations.  */
2475       if ((param_vect_partial_vector_usage == 1
2476            || loop_vinfo->suggested_unroll_factor > 1)
2477           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2478           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2479         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2480       else
2481         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2482     }
2483
2484   if (dump_enabled_p ())
2485     {
2486       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2487         dump_printf_loc (MSG_NOTE, vect_location,
2488                          "operating on partial vectors%s.\n",
2489                          for_epilogue_p ? " for epilogue loop" : "");
2490       else
2491         dump_printf_loc (MSG_NOTE, vect_location,
2492                          "operating only on full vectors%s.\n",
2493                          for_epilogue_p ? " for epilogue loop" : "");
2494     }
2495
2496   if (for_epilogue_p)
2497     {
2498       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2499       gcc_assert (orig_loop_vinfo);
2500       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2501         gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2502                               LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2503     }
2504
2505   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2506       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2507     {
2508       /* Check that the loop processes at least one full vector.  */
2509       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2510       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2511       if (known_lt (wi::to_widest (scalar_niters), vf))
2512         return opt_result::failure_at (vect_location,
2513                                        "loop does not have enough iterations"
2514                                        " to support vectorization.\n");
2515
2516       /* If we need to peel an extra epilogue iteration to handle data
2517          accesses with gaps, check that there are enough scalar iterations
2518          available.
2519
2520          The check above is redundant with this one when peeling for gaps,
2521          but the distinction is useful for diagnostics.  */
2522       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2523       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2524           && known_lt (wi::to_widest (scalar_nitersm1), vf))
2525         return opt_result::failure_at (vect_location,
2526                                        "loop does not have enough iterations"
2527                                        " to support peeling for gaps.\n");
2528     }
2529
2530   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2531     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2532        && need_peeling_or_partial_vectors_p);
2533
2534   return opt_result::success ();
2535 }
2536
2537 /* Function vect_analyze_loop_2.
2538
2539    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2540    analyses will record information in some members of LOOP_VINFO.  FATAL
2541    indicates if some analysis meets fatal error.  If one non-NULL pointer
2542    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2543    worked out suggested unroll factor, while one NULL pointer shows it's
2544    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2545    is to hold the slp decision when the suggested unroll factor is worked
2546    out.  */
2547 static opt_result
2548 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2549                      unsigned *suggested_unroll_factor,
2550                      bool& slp_done_for_suggested_uf)
2551 {
2552   opt_result ok = opt_result::success ();
2553   int res;
2554   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2555   poly_uint64 min_vf = 2;
2556   loop_vec_info orig_loop_vinfo = NULL;
2557
2558   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2559      loop_vec_info of the first vectorized loop.  */
2560   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2561     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2562   else
2563     orig_loop_vinfo = loop_vinfo;
2564   gcc_assert (orig_loop_vinfo);
2565
2566   /* The first group of checks is independent of the vector size.  */
2567   fatal = true;
2568
2569   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2570       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2571     return opt_result::failure_at (vect_location,
2572                                    "not vectorized: simd if(0)\n");
2573
2574   /* Find all data references in the loop (which correspond to vdefs/vuses)
2575      and analyze their evolution in the loop.  */
2576
2577   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2578
2579   /* Gather the data references and count stmts in the loop.  */
2580   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2581     {
2582       opt_result res
2583         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2584                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2585                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2586       if (!res)
2587         {
2588           if (dump_enabled_p ())
2589             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2590                              "not vectorized: loop contains function "
2591                              "calls or data references that cannot "
2592                              "be analyzed\n");
2593           return res;
2594         }
2595       loop_vinfo->shared->save_datarefs ();
2596     }
2597   else
2598     loop_vinfo->shared->check_datarefs ();
2599
2600   /* Analyze the data references and also adjust the minimal
2601      vectorization factor according to the loads and stores.  */
2602
2603   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2604   if (!ok)
2605     {
2606       if (dump_enabled_p ())
2607         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2608                          "bad data references.\n");
2609       return ok;
2610     }
2611
2612   /* Check if we are applying unroll factor now.  */
2613   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2614   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2615
2616   /* If the slp decision is false when suggested unroll factor is worked
2617      out, and we are applying suggested unroll factor, we can simply skip
2618      all slp related analyses this time.  */
2619   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2620
2621   /* Classify all cross-iteration scalar data-flow cycles.
2622      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2623   vect_analyze_scalar_cycles (loop_vinfo, slp);
2624
2625   vect_pattern_recog (loop_vinfo);
2626
2627   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2628
2629   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2630      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2631
2632   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2633   if (!ok)
2634     {
2635       if (dump_enabled_p ())
2636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637                          "bad data access.\n");
2638       return ok;
2639     }
2640
2641   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2642
2643   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2644   if (!ok)
2645     {
2646       if (dump_enabled_p ())
2647         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2648                          "unexpected pattern.\n");
2649       return ok;
2650     }
2651
2652   /* While the rest of the analysis below depends on it in some way.  */
2653   fatal = false;
2654
2655   /* Analyze data dependences between the data-refs in the loop
2656      and adjust the maximum vectorization factor according to
2657      the dependences.
2658      FORNOW: fail at the first data dependence that we encounter.  */
2659
2660   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2661   if (!ok)
2662     {
2663       if (dump_enabled_p ())
2664         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2665                          "bad data dependence.\n");
2666       return ok;
2667     }
2668   if (max_vf != MAX_VECTORIZATION_FACTOR
2669       && maybe_lt (max_vf, min_vf))
2670     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2671   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2672
2673   ok = vect_determine_vectorization_factor (loop_vinfo);
2674   if (!ok)
2675     {
2676       if (dump_enabled_p ())
2677         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2678                          "can't determine vectorization factor.\n");
2679       return ok;
2680     }
2681   if (max_vf != MAX_VECTORIZATION_FACTOR
2682       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2683     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2684
2685   /* Compute the scalar iteration cost.  */
2686   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2687
2688   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2689
2690   if (slp)
2691     {
2692       /* Check the SLP opportunities in the loop, analyze and build
2693          SLP trees.  */
2694       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2695       if (!ok)
2696         return ok;
2697
2698       /* If there are any SLP instances mark them as pure_slp.  */
2699       slp = vect_make_slp_decision (loop_vinfo);
2700       if (slp)
2701         {
2702           /* Find stmts that need to be both vectorized and SLPed.  */
2703           vect_detect_hybrid_slp (loop_vinfo);
2704
2705           /* Update the vectorization factor based on the SLP decision.  */
2706           vect_update_vf_for_slp (loop_vinfo);
2707
2708           /* Optimize the SLP graph with the vectorization factor fixed.  */
2709           vect_optimize_slp (loop_vinfo);
2710
2711           /* Gather the loads reachable from the SLP graph entries.  */
2712           vect_gather_slp_loads (loop_vinfo);
2713         }
2714     }
2715
2716   bool saved_can_use_partial_vectors_p
2717     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2718
2719   /* We don't expect to have to roll back to anything other than an empty
2720      set of rgroups.  */
2721   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2722
2723   /* This is the point where we can re-start analysis with SLP forced off.  */
2724 start_over:
2725
2726   /* Apply the suggested unrolling factor, this was determined by the backend
2727      during finish_cost the first time we ran the analyzis for this
2728      vector mode.  */
2729   if (applying_suggested_uf)
2730     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2731
2732   /* Now the vectorization factor is final.  */
2733   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2734   gcc_assert (known_ne (vectorization_factor, 0U));
2735
2736   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2737     {
2738       dump_printf_loc (MSG_NOTE, vect_location,
2739                        "vectorization_factor = ");
2740       dump_dec (MSG_NOTE, vectorization_factor);
2741       dump_printf (MSG_NOTE, ", niters = %wd\n",
2742                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2743     }
2744
2745   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2746
2747   /* Analyze the alignment of the data-refs in the loop.
2748      Fail if a data reference is found that cannot be vectorized.  */
2749
2750   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2751   if (!ok)
2752     {
2753       if (dump_enabled_p ())
2754         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2755                          "bad data alignment.\n");
2756       return ok;
2757     }
2758
2759   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2760      It is important to call pruning after vect_analyze_data_ref_accesses,
2761      since we use grouping information gathered by interleaving analysis.  */
2762   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2763   if (!ok)
2764     return ok;
2765
2766   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2767      vectorization, since we do not want to add extra peeling or
2768      add versioning for alignment.  */
2769   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2770     /* This pass will decide on using loop versioning and/or loop peeling in
2771        order to enhance the alignment of data references in the loop.  */
2772     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2773   if (!ok)
2774     return ok;
2775
2776   if (slp)
2777     {
2778       /* Analyze operations in the SLP instances.  Note this may
2779          remove unsupported SLP instances which makes the above
2780          SLP kind detection invalid.  */
2781       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2782       vect_slp_analyze_operations (loop_vinfo);
2783       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2784         {
2785           ok = opt_result::failure_at (vect_location,
2786                                        "unsupported SLP instances\n");
2787           goto again;
2788         }
2789
2790       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2791       slp_tree load_node, slp_root;
2792       unsigned i, x;
2793       slp_instance instance;
2794       bool can_use_lanes = true;
2795       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2796         {
2797           slp_root = SLP_INSTANCE_TREE (instance);
2798           int group_size = SLP_TREE_LANES (slp_root);
2799           tree vectype = SLP_TREE_VECTYPE (slp_root);
2800           bool loads_permuted = false;
2801           FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2802             {
2803               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2804                 continue;
2805               unsigned j;
2806               stmt_vec_info load_info;
2807               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2808                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2809                   {
2810                     loads_permuted = true;
2811                     break;
2812                   }
2813             }
2814
2815           /* If the loads and stores can be handled with load/store-lane
2816              instructions record it and move on to the next instance.  */
2817           if (loads_permuted
2818               && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2819               && vect_store_lanes_supported (vectype, group_size, false))
2820             {
2821               FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2822                 {
2823                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2824                       (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2825                   /* Use SLP for strided accesses (or if we can't
2826                      load-lanes).  */
2827                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2828                       || ! vect_load_lanes_supported
2829                             (STMT_VINFO_VECTYPE (stmt_vinfo),
2830                              DR_GROUP_SIZE (stmt_vinfo), false))
2831                     break;
2832                 }
2833
2834               can_use_lanes
2835                 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2836
2837               if (can_use_lanes && dump_enabled_p ())
2838                 dump_printf_loc (MSG_NOTE, vect_location,
2839                                  "SLP instance %p can use load/store-lanes\n",
2840                                  (void *) instance);
2841             }
2842           else
2843             {
2844               can_use_lanes = false;
2845               break;
2846             }
2847         }
2848
2849       /* If all SLP instances can use load/store-lanes abort SLP and try again
2850          with SLP disabled.  */
2851       if (can_use_lanes)
2852         {
2853           ok = opt_result::failure_at (vect_location,
2854                                        "Built SLP cancelled: can use "
2855                                        "load/store-lanes\n");
2856           if (dump_enabled_p ())
2857             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2858                              "Built SLP cancelled: all SLP instances support "
2859                              "load/store-lanes\n");
2860           goto again;
2861         }
2862     }
2863
2864   /* Dissolve SLP-only groups.  */
2865   vect_dissolve_slp_only_groups (loop_vinfo);
2866
2867   /* Scan all the remaining operations in the loop that are not subject
2868      to SLP and make sure they are vectorizable.  */
2869   ok = vect_analyze_loop_operations (loop_vinfo);
2870   if (!ok)
2871     {
2872       if (dump_enabled_p ())
2873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2874                          "bad operation or unsupported loop bound.\n");
2875       return ok;
2876     }
2877
2878   /* For now, we don't expect to mix both masking and length approaches for one
2879      loop, disable it if both are recorded.  */
2880   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2881       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2882       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2883     {
2884       if (dump_enabled_p ())
2885         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2886                          "can't vectorize a loop with partial vectors"
2887                          " because we don't expect to mix different"
2888                          " approaches with partial vectors for the"
2889                          " same loop.\n");
2890       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2891     }
2892
2893   /* If we still have the option of using partial vectors,
2894      check whether we can generate the necessary loop controls.  */
2895   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2896     {
2897       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2898         {
2899           if (!vect_verify_full_masking (loop_vinfo)
2900               && !vect_verify_full_masking_avx512 (loop_vinfo))
2901             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2902         }
2903       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2904         if (!vect_verify_loop_lens (loop_vinfo))
2905           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2906     }
2907
2908   /* If we're vectorizing a loop that uses length "controls" and
2909      can iterate more than once, we apply decrementing IV approach
2910      in loop control.  */
2911   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2912       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2913       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2914       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2915            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2916                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2917     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2918
2919   /* If a loop uses length controls and has a decrementing loop control IV,
2920      we will normally pass that IV through a MIN_EXPR to calcaluate the
2921      basis for the length controls.  E.g. in a loop that processes one
2922      element per scalar iteration, the number of elements would be
2923      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2924
2925      This MIN_EXPR approach allows us to use pointer IVs with an invariant
2926      step, since only the final iteration of the vector loop can have
2927      inactive lanes.
2928
2929      However, some targets have a dedicated instruction for calculating the
2930      preferred length, given the total number of elements that still need to
2931      be processed.  This is encapsulated in the SELECT_VL internal function.
2932
2933      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2934      to determine the basis for the length controls.  However, unlike the
2935      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2936      lanes inactive in any iteration of the vector loop, not just the last
2937      iteration.  This SELECT_VL approach therefore requires us to use pointer
2938      IVs with variable steps.
2939
2940      Once we've decided how many elements should be processed by one
2941      iteration of the vector loop, we need to populate the rgroup controls.
2942      If a loop has multiple rgroups, we need to make sure that those rgroups
2943      "line up" (that is, they must be consistent about which elements are
2944      active and which aren't).  This is done by vect_adjust_loop_lens_control.
2945
2946      In principle, it would be possible to use vect_adjust_loop_lens_control
2947      on either the result of a MIN_EXPR or the result of a SELECT_VL.
2948      However:
2949
2950      (1) In practice, it only makes sense to use SELECT_VL when a vector
2951          operation will be controlled directly by the result.  It is not
2952          worth using SELECT_VL if it would only be the input to other
2953          calculations.
2954
2955      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2956          pointer IV will need N updates by a variable amount (N-1 updates
2957          within the iteration and 1 update to move to the next iteration).
2958
2959      Because of this, we prefer to use the MIN_EXPR approach whenever there
2960      is more than one length control.
2961
2962      In addition, SELECT_VL always operates to a granularity of 1 unit.
2963      If we wanted to use it to control an SLP operation on N consecutive
2964      elements, we would need to make the SELECT_VL inputs measure scalar
2965      iterations (rather than elements) and then multiply the SELECT_VL
2966      result by N.  But using SELECT_VL this way is inefficient because
2967      of (1) above.
2968
2969      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2970         satisfied:
2971
2972      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2973      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2974
2975      Since SELECT_VL (variable step) will make SCEV analysis failed and then
2976      we will fail to gain benefits of following unroll optimizations. We prefer
2977      using the MIN_EXPR approach in this situation.  */
2978   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2979     {
2980       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2981       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
2982                                           OPTIMIZE_FOR_SPEED)
2983           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
2984           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
2985           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2986               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2987         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2988     }
2989
2990   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2991      to be able to handle fewer than VF scalars, or needs to have a lower VF
2992      than the main loop.  */
2993   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2994       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2995       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2996                    LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2997     return opt_result::failure_at (vect_location,
2998                                    "Vectorization factor too high for"
2999                                    " epilogue loop.\n");
3000
3001   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3002      assuming that the loop will be used as a main loop.  We will redo
3003      this analysis later if we instead decide to use the loop as an
3004      epilogue loop.  */
3005   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
3006   if (!ok)
3007     return ok;
3008
3009   /* Check the costings of the loop make vectorizing worthwhile.  */
3010   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3011   if (res < 0)
3012     {
3013       ok = opt_result::failure_at (vect_location,
3014                                    "Loop costings may not be worthwhile.\n");
3015       goto again;
3016     }
3017   if (!res)
3018     return opt_result::failure_at (vect_location,
3019                                    "Loop costings not worthwhile.\n");
3020
3021   /* If an epilogue loop is required make sure we can create one.  */
3022   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3023       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3024     {
3025       if (dump_enabled_p ())
3026         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3027       if (!vect_can_advance_ivs_p (loop_vinfo)
3028           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3029                                            single_exit (LOOP_VINFO_LOOP
3030                                                          (loop_vinfo))))
3031         {
3032           ok = opt_result::failure_at (vect_location,
3033                                        "not vectorized: can't create required "
3034                                        "epilog loop\n");
3035           goto again;
3036         }
3037     }
3038
3039   /* During peeling, we need to check if number of loop iterations is
3040      enough for both peeled prolog loop and vector loop.  This check
3041      can be merged along with threshold check of loop versioning, so
3042      increase threshold for this case if necessary.
3043
3044      If we are analyzing an epilogue we still want to check what its
3045      versioning threshold would be.  If we decide to vectorize the epilogues we
3046      will want to use the lowest versioning threshold of all epilogues and main
3047      loop.  This will enable us to enter a vectorized epilogue even when
3048      versioning the loop.  We can't simply check whether the epilogue requires
3049      versioning though since we may have skipped some versioning checks when
3050      analyzing the epilogue.  For instance, checks for alias versioning will be
3051      skipped when dealing with epilogues as we assume we already checked them
3052      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3053   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3054     {
3055       poly_uint64 niters_th = 0;
3056       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3057
3058       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3059         {
3060           /* Niters for peeled prolog loop.  */
3061           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3062             {
3063               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3064               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3065               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3066             }
3067           else
3068             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3069         }
3070
3071       /* Niters for at least one iteration of vectorized loop.  */
3072       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3073         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3074       /* One additional iteration because of peeling for gap.  */
3075       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3076         niters_th += 1;
3077
3078       /*  Use the same condition as vect_transform_loop to decide when to use
3079           the cost to determine a versioning threshold.  */
3080       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3081           && ordered_p (th, niters_th))
3082         niters_th = ordered_max (poly_uint64 (th), niters_th);
3083
3084       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3085     }
3086
3087   gcc_assert (known_eq (vectorization_factor,
3088                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3089
3090   slp_done_for_suggested_uf = slp;
3091
3092   /* Ok to vectorize!  */
3093   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3094   return opt_result::success ();
3095
3096 again:
3097   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3098   gcc_assert (!ok);
3099
3100   /* Try again with SLP forced off but if we didn't do any SLP there is
3101      no point in re-trying.  */
3102   if (!slp)
3103     return ok;
3104
3105   /* If the slp decision is true when suggested unroll factor is worked
3106      out, and we are applying suggested unroll factor, we don't need to
3107      re-try any more.  */
3108   if (applying_suggested_uf && slp_done_for_suggested_uf)
3109     return ok;
3110
3111   /* If there are reduction chains re-trying will fail anyway.  */
3112   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3113     return ok;
3114
3115   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3116      via interleaving or lane instructions.  */
3117   slp_instance instance;
3118   slp_tree node;
3119   unsigned i, j;
3120   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3121     {
3122       stmt_vec_info vinfo;
3123       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3124       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3125         continue;
3126       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3127       unsigned int size = DR_GROUP_SIZE (vinfo);
3128       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3129       if (! vect_store_lanes_supported (vectype, size, false)
3130          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3131          && ! vect_grouped_store_supported (vectype, size))
3132         return opt_result::failure_at (vinfo->stmt,
3133                                        "unsupported grouped store\n");
3134       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3135         {
3136           vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3137           vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3138           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3139           size = DR_GROUP_SIZE (vinfo);
3140           vectype = STMT_VINFO_VECTYPE (vinfo);
3141           if (! vect_load_lanes_supported (vectype, size, false)
3142               && ! vect_grouped_load_supported (vectype, single_element_p,
3143                                                 size))
3144             return opt_result::failure_at (vinfo->stmt,
3145                                            "unsupported grouped load\n");
3146         }
3147     }
3148
3149   if (dump_enabled_p ())
3150     dump_printf_loc (MSG_NOTE, vect_location,
3151                      "re-trying with SLP disabled\n");
3152
3153   /* Roll back state appropriately.  No SLP this time.  */
3154   slp = false;
3155   /* Restore vectorization factor as it were without SLP.  */
3156   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3157   /* Free the SLP instances.  */
3158   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3159     vect_free_slp_instance (instance);
3160   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3161   /* Reset SLP type to loop_vect on all stmts.  */
3162   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3163     {
3164       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3165       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3166            !gsi_end_p (si); gsi_next (&si))
3167         {
3168           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3169           STMT_SLP_TYPE (stmt_info) = loop_vect;
3170           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3171               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3172             {
3173               /* vectorizable_reduction adjusts reduction stmt def-types,
3174                  restore them to that of the PHI.  */
3175               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3176                 = STMT_VINFO_DEF_TYPE (stmt_info);
3177               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3178                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3179                 = STMT_VINFO_DEF_TYPE (stmt_info);
3180             }
3181         }
3182       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3183            !gsi_end_p (si); gsi_next (&si))
3184         {
3185           if (is_gimple_debug (gsi_stmt (si)))
3186             continue;
3187           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3188           STMT_SLP_TYPE (stmt_info) = loop_vect;
3189           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3190             {
3191               stmt_vec_info pattern_stmt_info
3192                 = STMT_VINFO_RELATED_STMT (stmt_info);
3193               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3194                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3195
3196               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3197               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3198               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3199                    !gsi_end_p (pi); gsi_next (&pi))
3200                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3201                   = loop_vect;
3202             }
3203         }
3204     }
3205   /* Free optimized alias test DDRS.  */
3206   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3207   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3208   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3209   /* Reset target cost data.  */
3210   delete loop_vinfo->vector_costs;
3211   loop_vinfo->vector_costs = nullptr;
3212   /* Reset accumulated rgroup information.  */
3213   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3214   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3215   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3216   /* Reset assorted flags.  */
3217   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3218   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3219   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3220   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3221   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3222     = saved_can_use_partial_vectors_p;
3223
3224   goto start_over;
3225 }
3226
3227 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3228    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3229    OLD_LOOP_VINFO is better unless something specifically indicates
3230    otherwise.
3231
3232    Note that this deliberately isn't a partial order.  */
3233
3234 static bool
3235 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3236                           loop_vec_info old_loop_vinfo)
3237 {
3238   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3239   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3240
3241   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3242   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3243
3244   /* Always prefer a VF of loop->simdlen over any other VF.  */
3245   if (loop->simdlen)
3246     {
3247       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3248       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3249       if (new_simdlen_p != old_simdlen_p)
3250         return new_simdlen_p;
3251     }
3252
3253   const auto *old_costs = old_loop_vinfo->vector_costs;
3254   const auto *new_costs = new_loop_vinfo->vector_costs;
3255   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3256     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3257
3258   return new_costs->better_main_loop_than_p (old_costs);
3259 }
3260
3261 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3262    true if we should.  */
3263
3264 static bool
3265 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3266                         loop_vec_info old_loop_vinfo)
3267 {
3268   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3269     return false;
3270
3271   if (dump_enabled_p ())
3272     dump_printf_loc (MSG_NOTE, vect_location,
3273                      "***** Preferring vector mode %s to vector mode %s\n",
3274                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3275                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3276   return true;
3277 }
3278
3279 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3280    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3281    MODE_I to the next mode useful to analyze.
3282    Return the loop_vinfo on success and wrapped null on failure.  */
3283
3284 static opt_loop_vec_info
3285 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3286                      const vect_loop_form_info *loop_form_info,
3287                      loop_vec_info main_loop_vinfo,
3288                      const vector_modes &vector_modes, unsigned &mode_i,
3289                      machine_mode &autodetected_vector_mode,
3290                      bool &fatal)
3291 {
3292   loop_vec_info loop_vinfo
3293     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3294
3295   machine_mode vector_mode = vector_modes[mode_i];
3296   loop_vinfo->vector_mode = vector_mode;
3297   unsigned int suggested_unroll_factor = 1;
3298   bool slp_done_for_suggested_uf;
3299
3300   /* Run the main analysis.  */
3301   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3302                                         &suggested_unroll_factor,
3303                                         slp_done_for_suggested_uf);
3304   if (dump_enabled_p ())
3305     dump_printf_loc (MSG_NOTE, vect_location,
3306                      "***** Analysis %s with vector mode %s\n",
3307                      res ? "succeeded" : " failed",
3308                      GET_MODE_NAME (loop_vinfo->vector_mode));
3309
3310   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3311     {
3312       if (dump_enabled_p ())
3313         dump_printf_loc (MSG_NOTE, vect_location,
3314                          "***** Re-trying analysis for unrolling"
3315                          " with unroll factor %d and slp %s.\n",
3316                          suggested_unroll_factor,
3317                          slp_done_for_suggested_uf ? "on" : "off");
3318       loop_vec_info unroll_vinfo
3319         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3320       unroll_vinfo->vector_mode = vector_mode;
3321       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3322       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3323                                                 slp_done_for_suggested_uf);
3324       if (new_res)
3325         {
3326           delete loop_vinfo;
3327           loop_vinfo = unroll_vinfo;
3328         }
3329       else
3330         delete unroll_vinfo;
3331     }
3332
3333   /* Remember the autodetected vector mode.  */
3334   if (vector_mode == VOIDmode)
3335     autodetected_vector_mode = loop_vinfo->vector_mode;
3336
3337   /* Advance mode_i, first skipping modes that would result in the
3338      same analysis result.  */
3339   while (mode_i + 1 < vector_modes.length ()
3340          && vect_chooses_same_modes_p (loop_vinfo,
3341                                        vector_modes[mode_i + 1]))
3342     {
3343       if (dump_enabled_p ())
3344         dump_printf_loc (MSG_NOTE, vect_location,
3345                          "***** The result for vector mode %s would"
3346                          " be the same\n",
3347                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3348       mode_i += 1;
3349     }
3350   if (mode_i + 1 < vector_modes.length ()
3351       && VECTOR_MODE_P (autodetected_vector_mode)
3352       && (related_vector_mode (vector_modes[mode_i + 1],
3353                                GET_MODE_INNER (autodetected_vector_mode))
3354           == autodetected_vector_mode)
3355       && (related_vector_mode (autodetected_vector_mode,
3356                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3357           == vector_modes[mode_i + 1]))
3358     {
3359       if (dump_enabled_p ())
3360         dump_printf_loc (MSG_NOTE, vect_location,
3361                          "***** Skipping vector mode %s, which would"
3362                          " repeat the analysis for %s\n",
3363                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3364                          GET_MODE_NAME (autodetected_vector_mode));
3365       mode_i += 1;
3366     }
3367   mode_i++;
3368
3369   if (!res)
3370     {
3371       delete loop_vinfo;
3372       if (fatal)
3373         gcc_checking_assert (main_loop_vinfo == NULL);
3374       return opt_loop_vec_info::propagate_failure (res);
3375     }
3376
3377   return opt_loop_vec_info::success (loop_vinfo);
3378 }
3379
3380 /* Function vect_analyze_loop.
3381
3382    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3383    for it.  The different analyses will record information in the
3384    loop_vec_info struct.  */
3385 opt_loop_vec_info
3386 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3387 {
3388   DUMP_VECT_SCOPE ("analyze_loop_nest");
3389
3390   if (loop_outer (loop)
3391       && loop_vec_info_for_loop (loop_outer (loop))
3392       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3393     return opt_loop_vec_info::failure_at (vect_location,
3394                                           "outer-loop already vectorized.\n");
3395
3396   if (!find_loop_nest (loop, &shared->loop_nest))
3397     return opt_loop_vec_info::failure_at
3398       (vect_location,
3399        "not vectorized: loop nest containing two or more consecutive inner"
3400        " loops cannot be vectorized\n");
3401
3402   /* Analyze the loop form.  */
3403   vect_loop_form_info loop_form_info;
3404   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3405   if (!res)
3406     {
3407       if (dump_enabled_p ())
3408         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3409                          "bad loop form.\n");
3410       return opt_loop_vec_info::propagate_failure (res);
3411     }
3412   if (!integer_onep (loop_form_info.assumptions))
3413     {
3414       /* We consider to vectorize this loop by versioning it under
3415          some assumptions.  In order to do this, we need to clear
3416          existing information computed by scev and niter analyzer.  */
3417       scev_reset_htab ();
3418       free_numbers_of_iterations_estimates (loop);
3419       /* Also set flag for this loop so that following scev and niter
3420          analysis are done under the assumptions.  */
3421       loop_constraint_set (loop, LOOP_C_FINITE);
3422     }
3423
3424   auto_vector_modes vector_modes;
3425   /* Autodetect first vector size we try.  */
3426   vector_modes.safe_push (VOIDmode);
3427   unsigned int autovec_flags
3428     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3429                                                     loop->simdlen != 0);
3430   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3431                              && !unlimited_cost_model (loop));
3432   machine_mode autodetected_vector_mode = VOIDmode;
3433   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3434   unsigned int mode_i = 0;
3435   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3436
3437   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3438      a mode has not been analyzed.  */
3439   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3440   for (unsigned i = 0; i < vector_modes.length (); ++i)
3441     cached_vf_per_mode.safe_push (0);
3442
3443   /* First determine the main loop vectorization mode, either the first
3444      one that works, starting with auto-detecting the vector mode and then
3445      following the targets order of preference, or the one with the
3446      lowest cost if pick_lowest_cost_p.  */
3447   while (1)
3448     {
3449       bool fatal;
3450       unsigned int last_mode_i = mode_i;
3451       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3452          failed.  */
3453       cached_vf_per_mode[last_mode_i] = -1;
3454       opt_loop_vec_info loop_vinfo
3455         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3456                                NULL, vector_modes, mode_i,
3457                                autodetected_vector_mode, fatal);
3458       if (fatal)
3459         break;
3460
3461       if (loop_vinfo)
3462         {
3463           /*  Analyzis has been successful so update the VF value.  The
3464               VF should always be a multiple of unroll_factor and we want to
3465               capture the original VF here.  */
3466           cached_vf_per_mode[last_mode_i]
3467             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3468                          loop_vinfo->suggested_unroll_factor);
3469           /* Once we hit the desired simdlen for the first time,
3470              discard any previous attempts.  */
3471           if (simdlen
3472               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3473             {
3474               delete first_loop_vinfo;
3475               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3476               simdlen = 0;
3477             }
3478           else if (pick_lowest_cost_p
3479                    && first_loop_vinfo
3480                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3481             {
3482               /* Pick loop_vinfo over first_loop_vinfo.  */
3483               delete first_loop_vinfo;
3484               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3485             }
3486           if (first_loop_vinfo == NULL)
3487             first_loop_vinfo = loop_vinfo;
3488           else
3489             {
3490               delete loop_vinfo;
3491               loop_vinfo = opt_loop_vec_info::success (NULL);
3492             }
3493
3494           /* Commit to first_loop_vinfo if we have no reason to try
3495              alternatives.  */
3496           if (!simdlen && !pick_lowest_cost_p)
3497             break;
3498         }
3499       if (mode_i == vector_modes.length ()
3500           || autodetected_vector_mode == VOIDmode)
3501         break;
3502
3503       /* Try the next biggest vector size.  */
3504       if (dump_enabled_p ())
3505         dump_printf_loc (MSG_NOTE, vect_location,
3506                          "***** Re-trying analysis with vector mode %s\n",
3507                          GET_MODE_NAME (vector_modes[mode_i]));
3508     }
3509   if (!first_loop_vinfo)
3510     return opt_loop_vec_info::propagate_failure (res);
3511
3512   if (dump_enabled_p ())
3513     dump_printf_loc (MSG_NOTE, vect_location,
3514                      "***** Choosing vector mode %s\n",
3515                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3516
3517   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3518      enabled, SIMDUID is not set, it is the innermost loop and we have
3519      either already found the loop's SIMDLEN or there was no SIMDLEN to
3520      begin with.
3521      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3522   bool vect_epilogues = (!simdlen
3523                          && loop->inner == NULL
3524                          && param_vect_epilogues_nomask
3525                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3526                          && !loop->simduid);
3527   if (!vect_epilogues)
3528     return first_loop_vinfo;
3529
3530   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3531   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3532
3533   /* For epilogues start the analysis from the first mode.  The motivation
3534      behind starting from the beginning comes from cases where the VECTOR_MODES
3535      array may contain length-agnostic and length-specific modes.  Their
3536      ordering is not guaranteed, so we could end up picking a mode for the main
3537      loop that is after the epilogue's optimal mode.  */
3538   vector_modes[0] = autodetected_vector_mode;
3539   mode_i = 0;
3540
3541   bool supports_partial_vectors =
3542     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3543   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3544
3545   while (1)
3546     {
3547       /* If the target does not support partial vectors we can shorten the
3548          number of modes to analyze for the epilogue as we know we can't pick a
3549          mode that would lead to a VF at least as big as the
3550          FIRST_VINFO_VF.  */
3551       if (!supports_partial_vectors
3552           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3553         {
3554           mode_i++;
3555           if (mode_i == vector_modes.length ())
3556             break;
3557           continue;
3558         }
3559
3560       if (dump_enabled_p ())
3561         dump_printf_loc (MSG_NOTE, vect_location,
3562                          "***** Re-trying epilogue analysis with vector "
3563                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3564
3565       bool fatal;
3566       opt_loop_vec_info loop_vinfo
3567         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3568                                first_loop_vinfo,
3569                                vector_modes, mode_i,
3570                                autodetected_vector_mode, fatal);
3571       if (fatal)
3572         break;
3573
3574       if (loop_vinfo)
3575         {
3576           if (pick_lowest_cost_p)
3577             {
3578               /* Keep trying to roll back vectorization attempts while the
3579                  loop_vec_infos they produced were worse than this one.  */
3580               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3581               while (!vinfos.is_empty ()
3582                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3583                 {
3584                   gcc_assert (vect_epilogues);
3585                   delete vinfos.pop ();
3586                 }
3587             }
3588           /* For now only allow one epilogue loop.  */
3589           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3590             {
3591               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3592               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3593               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3594                           || maybe_ne (lowest_th, 0U));
3595               /* Keep track of the known smallest versioning
3596                  threshold.  */
3597               if (ordered_p (lowest_th, th))
3598                 lowest_th = ordered_min (lowest_th, th);
3599             }
3600           else
3601             {
3602               delete loop_vinfo;
3603               loop_vinfo = opt_loop_vec_info::success (NULL);
3604             }
3605
3606           /* For now only allow one epilogue loop, but allow
3607              pick_lowest_cost_p to replace it, so commit to the
3608              first epilogue if we have no reason to try alternatives.  */
3609           if (!pick_lowest_cost_p)
3610             break;
3611         }
3612
3613       if (mode_i == vector_modes.length ())
3614         break;
3615
3616     }
3617
3618   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3619     {
3620       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3621       if (dump_enabled_p ())
3622         dump_printf_loc (MSG_NOTE, vect_location,
3623                          "***** Choosing epilogue vector mode %s\n",
3624                          GET_MODE_NAME
3625                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3626     }
3627
3628   return first_loop_vinfo;
3629 }
3630
3631 /* Return true if there is an in-order reduction function for CODE, storing
3632    it in *REDUC_FN if so.  */
3633
3634 static bool
3635 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3636 {
3637   if (code == PLUS_EXPR)
3638     {
3639       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3640       return true;
3641     }
3642   return false;
3643 }
3644
3645 /* Function reduction_fn_for_scalar_code
3646
3647    Input:
3648    CODE - tree_code of a reduction operations.
3649
3650    Output:
3651    REDUC_FN - the corresponding internal function to be used to reduce the
3652       vector of partial results into a single scalar result, or IFN_LAST
3653       if the operation is a supported reduction operation, but does not have
3654       such an internal function.
3655
3656    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3657
3658 bool
3659 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3660 {
3661   if (code.is_tree_code ())
3662     switch (tree_code (code))
3663       {
3664       case MAX_EXPR:
3665         *reduc_fn = IFN_REDUC_MAX;
3666         return true;
3667
3668       case MIN_EXPR:
3669         *reduc_fn = IFN_REDUC_MIN;
3670         return true;
3671
3672       case PLUS_EXPR:
3673         *reduc_fn = IFN_REDUC_PLUS;
3674         return true;
3675
3676       case BIT_AND_EXPR:
3677         *reduc_fn = IFN_REDUC_AND;
3678         return true;
3679
3680       case BIT_IOR_EXPR:
3681         *reduc_fn = IFN_REDUC_IOR;
3682         return true;
3683
3684       case BIT_XOR_EXPR:
3685         *reduc_fn = IFN_REDUC_XOR;
3686         return true;
3687
3688       case MULT_EXPR:
3689       case MINUS_EXPR:
3690         *reduc_fn = IFN_LAST;
3691         return true;
3692
3693       default:
3694         return false;
3695       }
3696   else
3697     switch (combined_fn (code))
3698       {
3699       CASE_CFN_FMAX:
3700         *reduc_fn = IFN_REDUC_FMAX;
3701         return true;
3702
3703       CASE_CFN_FMIN:
3704         *reduc_fn = IFN_REDUC_FMIN;
3705         return true;
3706
3707       default:
3708         return false;
3709       }
3710 }
3711
3712 /* If there is a neutral value X such that a reduction would not be affected
3713    by the introduction of additional X elements, return that X, otherwise
3714    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3715    of the scalar elements.  If the reduction has just a single initial value
3716    then INITIAL_VALUE is that value, otherwise it is null.  */
3717
3718 tree
3719 neutral_op_for_reduction (tree scalar_type, code_helper code,
3720                           tree initial_value)
3721 {
3722   if (code.is_tree_code ())
3723     switch (tree_code (code))
3724       {
3725       case WIDEN_SUM_EXPR:
3726       case DOT_PROD_EXPR:
3727       case SAD_EXPR:
3728       case PLUS_EXPR:
3729       case MINUS_EXPR:
3730       case BIT_IOR_EXPR:
3731       case BIT_XOR_EXPR:
3732         return build_zero_cst (scalar_type);
3733
3734       case MULT_EXPR:
3735         return build_one_cst (scalar_type);
3736
3737       case BIT_AND_EXPR:
3738         return build_all_ones_cst (scalar_type);
3739
3740       case MAX_EXPR:
3741       case MIN_EXPR:
3742         return initial_value;
3743
3744       default:
3745         return NULL_TREE;
3746       }
3747   else
3748     switch (combined_fn (code))
3749       {
3750       CASE_CFN_FMIN:
3751       CASE_CFN_FMAX:
3752         return initial_value;
3753
3754       default:
3755         return NULL_TREE;
3756       }
3757 }
3758
3759 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3760    STMT is printed with a message MSG. */
3761
3762 static void
3763 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3764 {
3765   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3766 }
3767
3768 /* Return true if we need an in-order reduction for operation CODE
3769    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3770    overflow must wrap.  */
3771
3772 bool
3773 needs_fold_left_reduction_p (tree type, code_helper code)
3774 {
3775   /* CHECKME: check for !flag_finite_math_only too?  */
3776   if (SCALAR_FLOAT_TYPE_P (type))
3777     {
3778       if (code.is_tree_code ())
3779         switch (tree_code (code))
3780           {
3781           case MIN_EXPR:
3782           case MAX_EXPR:
3783             return false;
3784
3785           default:
3786             return !flag_associative_math;
3787           }
3788       else
3789         switch (combined_fn (code))
3790           {
3791           CASE_CFN_FMIN:
3792           CASE_CFN_FMAX:
3793             return false;
3794
3795           default:
3796             return !flag_associative_math;
3797           }
3798     }
3799
3800   if (INTEGRAL_TYPE_P (type))
3801     return (!code.is_tree_code ()
3802             || !operation_no_trapping_overflow (type, tree_code (code)));
3803
3804   if (SAT_FIXED_POINT_TYPE_P (type))
3805     return true;
3806
3807   return false;
3808 }
3809
3810 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3811    has a handled computation expression.  Store the main reduction
3812    operation in *CODE.  */
3813
3814 static bool
3815 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3816                       tree loop_arg, code_helper *code,
3817                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3818 {
3819   auto_bitmap visited;
3820   tree lookfor = PHI_RESULT (phi);
3821   ssa_op_iter curri;
3822   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3823   while (USE_FROM_PTR (curr) != loop_arg)
3824     curr = op_iter_next_use (&curri);
3825   curri.i = curri.numops;
3826   do
3827     {
3828       path.safe_push (std::make_pair (curri, curr));
3829       tree use = USE_FROM_PTR (curr);
3830       if (use == lookfor)
3831         break;
3832       gimple *def = SSA_NAME_DEF_STMT (use);
3833       if (gimple_nop_p (def)
3834           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3835         {
3836 pop:
3837           do
3838             {
3839               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3840               curri = x.first;
3841               curr = x.second;
3842               do
3843                 curr = op_iter_next_use (&curri);
3844               /* Skip already visited or non-SSA operands (from iterating
3845                  over PHI args).  */
3846               while (curr != NULL_USE_OPERAND_P
3847                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3848                          || ! bitmap_set_bit (visited,
3849                                               SSA_NAME_VERSION
3850                                                 (USE_FROM_PTR (curr)))));
3851             }
3852           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3853           if (curr == NULL_USE_OPERAND_P)
3854             break;
3855         }
3856       else
3857         {
3858           if (gimple_code (def) == GIMPLE_PHI)
3859             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3860           else
3861             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3862           while (curr != NULL_USE_OPERAND_P
3863                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3864                      || ! bitmap_set_bit (visited,
3865                                           SSA_NAME_VERSION
3866                                             (USE_FROM_PTR (curr)))))
3867             curr = op_iter_next_use (&curri);
3868           if (curr == NULL_USE_OPERAND_P)
3869             goto pop;
3870         }
3871     }
3872   while (1);
3873   if (dump_file && (dump_flags & TDF_DETAILS))
3874     {
3875       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3876       unsigned i;
3877       std::pair<ssa_op_iter, use_operand_p> *x;
3878       FOR_EACH_VEC_ELT (path, i, x)
3879         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3880       dump_printf (MSG_NOTE, "\n");
3881     }
3882
3883   /* Check whether the reduction path detected is valid.  */
3884   bool fail = path.length () == 0;
3885   bool neg = false;
3886   int sign = -1;
3887   *code = ERROR_MARK;
3888   for (unsigned i = 1; i < path.length (); ++i)
3889     {
3890       gimple *use_stmt = USE_STMT (path[i].second);
3891       gimple_match_op op;
3892       if (!gimple_extract_op (use_stmt, &op))
3893         {
3894           fail = true;
3895           break;
3896         }
3897       unsigned int opi = op.num_ops;
3898       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3899         {
3900           /* The following make sure we can compute the operand index
3901              easily plus it mostly disallows chaining via COND_EXPR condition
3902              operands.  */
3903           for (opi = 0; opi < op.num_ops; ++opi)
3904             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3905               break;
3906         }
3907       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3908         {
3909           for (opi = 0; opi < op.num_ops; ++opi)
3910             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3911               break;
3912         }
3913       if (opi == op.num_ops)
3914         {
3915           fail = true;
3916           break;
3917         }
3918       op.code = canonicalize_code (op.code, op.type);
3919       if (op.code == MINUS_EXPR)
3920         {
3921           op.code = PLUS_EXPR;
3922           /* Track whether we negate the reduction value each iteration.  */
3923           if (op.ops[1] == op.ops[opi])
3924             neg = ! neg;
3925         }
3926       if (CONVERT_EXPR_CODE_P (op.code)
3927           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3928         ;
3929       else if (*code == ERROR_MARK)
3930         {
3931           *code = op.code;
3932           sign = TYPE_SIGN (op.type);
3933         }
3934       else if (op.code != *code)
3935         {
3936           fail = true;
3937           break;
3938         }
3939       else if ((op.code == MIN_EXPR
3940                 || op.code == MAX_EXPR)
3941                && sign != TYPE_SIGN (op.type))
3942         {
3943           fail = true;
3944           break;
3945         }
3946       /* Check there's only a single stmt the op is used on.  For the
3947          not value-changing tail and the last stmt allow out-of-loop uses.
3948          ???  We could relax this and handle arbitrary live stmts by
3949          forcing a scalar epilogue for example.  */
3950       imm_use_iterator imm_iter;
3951       gimple *op_use_stmt;
3952       unsigned cnt = 0;
3953       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3954         if (!is_gimple_debug (op_use_stmt)
3955             && (*code != ERROR_MARK
3956                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3957           {
3958             /* We want to allow x + x but not x < 1 ? x : 2.  */
3959             if (is_gimple_assign (op_use_stmt)
3960                 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3961               {
3962                 use_operand_p use_p;
3963                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3964                   cnt++;
3965               }
3966             else
3967               cnt++;
3968           }
3969       if (cnt != 1)
3970         {
3971           fail = true;
3972           break;
3973         }
3974     }
3975   return ! fail && ! neg && *code != ERROR_MARK;
3976 }
3977
3978 bool
3979 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3980                       tree loop_arg, enum tree_code code)
3981 {
3982   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3983   code_helper code_;
3984   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3985           && code_ == code);
3986 }
3987
3988
3989
3990 /* Function vect_is_simple_reduction
3991
3992    (1) Detect a cross-iteration def-use cycle that represents a simple
3993    reduction computation.  We look for the following pattern:
3994
3995    loop_header:
3996      a1 = phi < a0, a2 >
3997      a3 = ...
3998      a2 = operation (a3, a1)
3999
4000    or
4001
4002    a3 = ...
4003    loop_header:
4004      a1 = phi < a0, a2 >
4005      a2 = operation (a3, a1)
4006
4007    such that:
4008    1. operation is commutative and associative and it is safe to
4009       change the order of the computation
4010    2. no uses for a2 in the loop (a2 is used out of the loop)
4011    3. no uses of a1 in the loop besides the reduction operation
4012    4. no uses of a1 outside the loop.
4013
4014    Conditions 1,4 are tested here.
4015    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4016
4017    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4018    nested cycles.
4019
4020    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4021    reductions:
4022
4023      a1 = phi < a0, a2 >
4024      inner loop (def of a3)
4025      a2 = phi < a3 >
4026
4027    (4) Detect condition expressions, ie:
4028      for (int i = 0; i < N; i++)
4029        if (a[i] < val)
4030         ret_val = a[i];
4031
4032 */
4033
4034 static stmt_vec_info
4035 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4036                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4037 {
4038   gphi *phi = as_a <gphi *> (phi_info->stmt);
4039   gimple *phi_use_stmt = NULL;
4040   imm_use_iterator imm_iter;
4041   use_operand_p use_p;
4042
4043   *double_reduc = false;
4044   *reduc_chain_p = false;
4045   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4046
4047   tree phi_name = PHI_RESULT (phi);
4048   /* ???  If there are no uses of the PHI result the inner loop reduction
4049      won't be detected as possibly double-reduction by vectorizable_reduction
4050      because that tries to walk the PHI arg from the preheader edge which
4051      can be constant.  See PR60382.  */
4052   if (has_zero_uses (phi_name))
4053     return NULL;
4054   class loop *loop = (gimple_bb (phi))->loop_father;
4055   unsigned nphi_def_loop_uses = 0;
4056   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4057     {
4058       gimple *use_stmt = USE_STMT (use_p);
4059       if (is_gimple_debug (use_stmt))
4060         continue;
4061
4062       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4063         {
4064           if (dump_enabled_p ())
4065             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4066                              "intermediate value used outside loop.\n");
4067
4068           return NULL;
4069         }
4070
4071       nphi_def_loop_uses++;
4072       phi_use_stmt = use_stmt;
4073     }
4074
4075   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4076   if (TREE_CODE (latch_def) != SSA_NAME)
4077     {
4078       if (dump_enabled_p ())
4079         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4080                          "reduction: not ssa_name: %T\n", latch_def);
4081       return NULL;
4082     }
4083
4084   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4085   if (!def_stmt_info
4086       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4087     return NULL;
4088
4089   bool nested_in_vect_loop
4090     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4091   unsigned nlatch_def_loop_uses = 0;
4092   auto_vec<gphi *, 3> lcphis;
4093   bool inner_loop_of_double_reduc = false;
4094   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4095     {
4096       gimple *use_stmt = USE_STMT (use_p);
4097       if (is_gimple_debug (use_stmt))
4098         continue;
4099       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4100         nlatch_def_loop_uses++;
4101       else
4102         {
4103           /* We can have more than one loop-closed PHI.  */
4104           lcphis.safe_push (as_a <gphi *> (use_stmt));
4105           if (nested_in_vect_loop
4106               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4107                   == vect_double_reduction_def))
4108             inner_loop_of_double_reduc = true;
4109         }
4110     }
4111
4112   /* If we are vectorizing an inner reduction we are executing that
4113      in the original order only in case we are not dealing with a
4114      double reduction.  */
4115   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4116     {
4117       if (dump_enabled_p ())
4118         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4119                         "detected nested cycle: ");
4120       return def_stmt_info;
4121     }
4122
4123   /* When the inner loop of a double reduction ends up with more than
4124      one loop-closed PHI we have failed to classify alternate such
4125      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4126   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4127     {
4128       if (dump_enabled_p ())
4129         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4130                          "unhandle double reduction\n");
4131       return NULL;
4132     }
4133
4134   /* If this isn't a nested cycle or if the nested cycle reduction value
4135      is used ouside of the inner loop we cannot handle uses of the reduction
4136      value.  */
4137   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4138     {
4139       if (dump_enabled_p ())
4140         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4141                          "reduction used in loop.\n");
4142       return NULL;
4143     }
4144
4145   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4146      defined in the inner loop.  */
4147   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4148     {
4149       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4150       if (gimple_phi_num_args (def_stmt) != 1
4151           || TREE_CODE (op1) != SSA_NAME)
4152         {
4153           if (dump_enabled_p ())
4154             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4155                              "unsupported phi node definition.\n");
4156
4157           return NULL;
4158         }
4159
4160       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4161          and the latch definition op1.  */
4162       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4163       if (gimple_bb (def1)
4164           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4165           && loop->inner
4166           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4167           && (is_gimple_assign (def1) || is_gimple_call (def1))
4168           && is_a <gphi *> (phi_use_stmt)
4169           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4170           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4171                                             loop_latch_edge (loop->inner))))
4172         {
4173           if (dump_enabled_p ())
4174             report_vect_op (MSG_NOTE, def_stmt,
4175                             "detected double reduction: ");
4176
4177           *double_reduc = true;
4178           return def_stmt_info;
4179         }
4180
4181       return NULL;
4182     }
4183
4184   /* Look for the expression computing latch_def from then loop PHI result.  */
4185   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4186   code_helper code;
4187   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4188                             path))
4189     {
4190       STMT_VINFO_REDUC_CODE (phi_info) = code;
4191       if (code == COND_EXPR && !nested_in_vect_loop)
4192         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4193
4194       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4195          reduction chain for which the additional restriction is that
4196          all operations in the chain are the same.  */
4197       auto_vec<stmt_vec_info, 8> reduc_chain;
4198       unsigned i;
4199       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4200       for (i = path.length () - 1; i >= 1; --i)
4201         {
4202           gimple *stmt = USE_STMT (path[i].second);
4203           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4204           gimple_match_op op;
4205           if (!gimple_extract_op (stmt, &op))
4206             gcc_unreachable ();
4207           if (gassign *assign = dyn_cast<gassign *> (stmt))
4208             STMT_VINFO_REDUC_IDX (stmt_info)
4209               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4210           else
4211             {
4212               gcall *call = as_a<gcall *> (stmt);
4213               STMT_VINFO_REDUC_IDX (stmt_info)
4214                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4215             }
4216           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4217                                      && (i == 1 || i == path.length () - 1));
4218           if ((op.code != code && !leading_conversion)
4219               /* We can only handle the final value in epilogue
4220                  generation for reduction chains.  */
4221               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4222             is_slp_reduc = false;
4223           /* For reduction chains we support a trailing/leading
4224              conversions.  We do not store those in the actual chain.  */
4225           if (leading_conversion)
4226             continue;
4227           reduc_chain.safe_push (stmt_info);
4228         }
4229       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4230         {
4231           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4232             {
4233               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4234               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4235             }
4236           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4237           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4238
4239           /* Save the chain for further analysis in SLP detection.  */
4240           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4241           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4242
4243           *reduc_chain_p = true;
4244           if (dump_enabled_p ())
4245             dump_printf_loc (MSG_NOTE, vect_location,
4246                             "reduction: detected reduction chain\n");
4247         }
4248       else if (dump_enabled_p ())
4249         dump_printf_loc (MSG_NOTE, vect_location,
4250                          "reduction: detected reduction\n");
4251
4252       return def_stmt_info;
4253     }
4254
4255   if (dump_enabled_p ())
4256     dump_printf_loc (MSG_NOTE, vect_location,
4257                      "reduction: unknown pattern\n");
4258
4259   return NULL;
4260 }
4261
4262 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4263    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4264    or -1 if not known.  */
4265
4266 static int
4267 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4268 {
4269   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4270   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4271     {
4272       if (dump_enabled_p ())
4273         dump_printf_loc (MSG_NOTE, vect_location,
4274                          "cost model: epilogue peel iters set to vf/2 "
4275                          "because loop iterations are unknown .\n");
4276       return assumed_vf / 2;
4277     }
4278   else
4279     {
4280       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4281       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4282       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4283       /* If we need to peel for gaps, but no peeling is required, we have to
4284          peel VF iterations.  */
4285       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4286         peel_iters_epilogue = assumed_vf;
4287       return peel_iters_epilogue;
4288     }
4289 }
4290
4291 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4292 int
4293 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4294                              int *peel_iters_epilogue,
4295                              stmt_vector_for_cost *scalar_cost_vec,
4296                              stmt_vector_for_cost *prologue_cost_vec,
4297                              stmt_vector_for_cost *epilogue_cost_vec)
4298 {
4299   int retval = 0;
4300
4301   *peel_iters_epilogue
4302     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4303
4304   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4305     {
4306       /* If peeled iterations are known but number of scalar loop
4307          iterations are unknown, count a taken branch per peeled loop.  */
4308       if (peel_iters_prologue > 0)
4309         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4310                                    vect_prologue);
4311       if (*peel_iters_epilogue > 0)
4312         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4313                                     vect_epilogue);
4314     }
4315
4316   stmt_info_for_cost *si;
4317   int j;
4318   if (peel_iters_prologue)
4319     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4320       retval += record_stmt_cost (prologue_cost_vec,
4321                                   si->count * peel_iters_prologue,
4322                                   si->kind, si->stmt_info, si->misalign,
4323                                   vect_prologue);
4324   if (*peel_iters_epilogue)
4325     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4326       retval += record_stmt_cost (epilogue_cost_vec,
4327                                   si->count * *peel_iters_epilogue,
4328                                   si->kind, si->stmt_info, si->misalign,
4329                                   vect_epilogue);
4330
4331   return retval;
4332 }
4333
4334 /* Function vect_estimate_min_profitable_iters
4335
4336    Return the number of iterations required for the vector version of the
4337    loop to be profitable relative to the cost of the scalar version of the
4338    loop.
4339
4340    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4341    of iterations for vectorization.  -1 value means loop vectorization
4342    is not profitable.  This returned value may be used for dynamic
4343    profitability check.
4344
4345    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4346    for static check against estimated number of iterations.  */
4347
4348 static void
4349 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4350                                     int *ret_min_profitable_niters,
4351                                     int *ret_min_profitable_estimate,
4352                                     unsigned *suggested_unroll_factor)
4353 {
4354   int min_profitable_iters;
4355   int min_profitable_estimate;
4356   int peel_iters_prologue;
4357   int peel_iters_epilogue;
4358   unsigned vec_inside_cost = 0;
4359   int vec_outside_cost = 0;
4360   unsigned vec_prologue_cost = 0;
4361   unsigned vec_epilogue_cost = 0;
4362   int scalar_single_iter_cost = 0;
4363   int scalar_outside_cost = 0;
4364   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4365   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4366   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4367
4368   /* Cost model disabled.  */
4369   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4370     {
4371       if (dump_enabled_p ())
4372         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4373       *ret_min_profitable_niters = 0;
4374       *ret_min_profitable_estimate = 0;
4375       return;
4376     }
4377
4378   /* Requires loop versioning tests to handle misalignment.  */
4379   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4380     {
4381       /*  FIXME: Make cost depend on complexity of individual check.  */
4382       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4383       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4384       if (dump_enabled_p ())
4385         dump_printf (MSG_NOTE,
4386                      "cost model: Adding cost of checks for loop "
4387                      "versioning to treat misalignment.\n");
4388     }
4389
4390   /* Requires loop versioning with alias checks.  */
4391   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4392     {
4393       /*  FIXME: Make cost depend on complexity of individual check.  */
4394       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4395       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4396       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4397       if (len)
4398         /* Count LEN - 1 ANDs and LEN comparisons.  */
4399         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4400                               scalar_stmt, vect_prologue);
4401       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4402       if (len)
4403         {
4404           /* Count LEN - 1 ANDs and LEN comparisons.  */
4405           unsigned int nstmts = len * 2 - 1;
4406           /* +1 for each bias that needs adding.  */
4407           for (unsigned int i = 0; i < len; ++i)
4408             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4409               nstmts += 1;
4410           (void) add_stmt_cost (target_cost_data, nstmts,
4411                                 scalar_stmt, vect_prologue);
4412         }
4413       if (dump_enabled_p ())
4414         dump_printf (MSG_NOTE,
4415                      "cost model: Adding cost of checks for loop "
4416                      "versioning aliasing.\n");
4417     }
4418
4419   /* Requires loop versioning with niter checks.  */
4420   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4421     {
4422       /*  FIXME: Make cost depend on complexity of individual check.  */
4423       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4424                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4425       if (dump_enabled_p ())
4426         dump_printf (MSG_NOTE,
4427                      "cost model: Adding cost of checks for loop "
4428                      "versioning niters.\n");
4429     }
4430
4431   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4432     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4433                           vect_prologue);
4434
4435   /* Count statements in scalar loop.  Using this as scalar cost for a single
4436      iteration for now.
4437
4438      TODO: Add outer loop support.
4439
4440      TODO: Consider assigning different costs to different scalar
4441      statements.  */
4442
4443   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4444
4445   /* Add additional cost for the peeled instructions in prologue and epilogue
4446      loop.  (For fully-masked loops there will be no peeling.)
4447
4448      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4449      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4450
4451      TODO: Build an expression that represents peel_iters for prologue and
4452      epilogue to be used in a run-time test.  */
4453
4454   bool prologue_need_br_taken_cost = false;
4455   bool prologue_need_br_not_taken_cost = false;
4456
4457   /* Calculate peel_iters_prologue.  */
4458   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4459     peel_iters_prologue = 0;
4460   else if (npeel < 0)
4461     {
4462       peel_iters_prologue = assumed_vf / 2;
4463       if (dump_enabled_p ())
4464         dump_printf (MSG_NOTE, "cost model: "
4465                      "prologue peel iters set to vf/2.\n");
4466
4467       /* If peeled iterations are unknown, count a taken branch and a not taken
4468          branch per peeled loop.  Even if scalar loop iterations are known,
4469          vector iterations are not known since peeled prologue iterations are
4470          not known.  Hence guards remain the same.  */
4471       prologue_need_br_taken_cost = true;
4472       prologue_need_br_not_taken_cost = true;
4473     }
4474   else
4475     {
4476       peel_iters_prologue = npeel;
4477       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4478         /* If peeled iterations are known but number of scalar loop
4479            iterations are unknown, count a taken branch per peeled loop.  */
4480         prologue_need_br_taken_cost = true;
4481     }
4482
4483   bool epilogue_need_br_taken_cost = false;
4484   bool epilogue_need_br_not_taken_cost = false;
4485
4486   /* Calculate peel_iters_epilogue.  */
4487   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4488     /* We need to peel exactly one iteration for gaps.  */
4489     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4490   else if (npeel < 0)
4491     {
4492       /* If peeling for alignment is unknown, loop bound of main loop
4493          becomes unknown.  */
4494       peel_iters_epilogue = assumed_vf / 2;
4495       if (dump_enabled_p ())
4496         dump_printf (MSG_NOTE, "cost model: "
4497                      "epilogue peel iters set to vf/2 because "
4498                      "peeling for alignment is unknown.\n");
4499
4500       /* See the same reason above in peel_iters_prologue calculation.  */
4501       epilogue_need_br_taken_cost = true;
4502       epilogue_need_br_not_taken_cost = true;
4503     }
4504   else
4505     {
4506       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4507       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4508         /* If peeled iterations are known but number of scalar loop
4509            iterations are unknown, count a taken branch per peeled loop.  */
4510         epilogue_need_br_taken_cost = true;
4511     }
4512
4513   stmt_info_for_cost *si;
4514   int j;
4515   /* Add costs associated with peel_iters_prologue.  */
4516   if (peel_iters_prologue)
4517     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4518       {
4519         (void) add_stmt_cost (target_cost_data,
4520                               si->count * peel_iters_prologue, si->kind,
4521                               si->stmt_info, si->node, si->vectype,
4522                               si->misalign, vect_prologue);
4523       }
4524
4525   /* Add costs associated with peel_iters_epilogue.  */
4526   if (peel_iters_epilogue)
4527     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4528       {
4529         (void) add_stmt_cost (target_cost_data,
4530                               si->count * peel_iters_epilogue, si->kind,
4531                               si->stmt_info, si->node, si->vectype,
4532                               si->misalign, vect_epilogue);
4533       }
4534
4535   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4536
4537   if (prologue_need_br_taken_cost)
4538     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4539                           vect_prologue);
4540
4541   if (prologue_need_br_not_taken_cost)
4542     (void) add_stmt_cost (target_cost_data, 1,
4543                           cond_branch_not_taken, vect_prologue);
4544
4545   if (epilogue_need_br_taken_cost)
4546     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4547                           vect_epilogue);
4548
4549   if (epilogue_need_br_not_taken_cost)
4550     (void) add_stmt_cost (target_cost_data, 1,
4551                           cond_branch_not_taken, vect_epilogue);
4552
4553   /* Take care of special costs for rgroup controls of partial vectors.  */
4554   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4555       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4556           == vect_partial_vectors_avx512))
4557     {
4558       /* Calculate how many masks we need to generate.  */
4559       unsigned int num_masks = 0;
4560       bool need_saturation = false;
4561       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4562         if (rgm.type)
4563           {
4564             unsigned nvectors = rgm.factor;
4565             num_masks += nvectors;
4566             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4567                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4568               need_saturation = true;
4569           }
4570
4571       /* ???  The target isn't able to identify the costs below as
4572          producing masks so it cannot penaltize cases where we'd run
4573          out of mask registers for example.  */
4574
4575       /* ???  We are also failing to account for smaller vector masks
4576          we generate by splitting larger masks in vect_get_loop_mask.  */
4577
4578       /* In the worst case, we need to generate each mask in the prologue
4579          and in the loop body.  We need one splat per group and one
4580          compare per mask.
4581
4582          Sometimes the prologue mask will fold to a constant,
4583          so the actual prologue cost might be smaller.  However, it's
4584          simpler and safer to use the worst-case cost; if this ends up
4585          being the tie-breaker between vectorizing or not, then it's
4586          probably better not to vectorize.  */
4587       (void) add_stmt_cost (target_cost_data,
4588                             num_masks
4589                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4590                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4591                             vect_prologue);
4592       (void) add_stmt_cost (target_cost_data,
4593                             num_masks
4594                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4595                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4596
4597       /* When we need saturation we need it both in the prologue and
4598          the epilogue.  */
4599       if (need_saturation)
4600         {
4601           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4602                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4603           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4604                                 NULL, NULL, NULL_TREE, 0, vect_body);
4605         }
4606     }
4607   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4608            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4609                == vect_partial_vectors_while_ult))
4610     {
4611       /* Calculate how many masks we need to generate.  */
4612       unsigned int num_masks = 0;
4613       rgroup_controls *rgm;
4614       unsigned int num_vectors_m1;
4615       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4616                         num_vectors_m1, rgm)
4617         if (rgm->type)
4618           num_masks += num_vectors_m1 + 1;
4619       gcc_assert (num_masks > 0);
4620
4621       /* In the worst case, we need to generate each mask in the prologue
4622          and in the loop body.  One of the loop body mask instructions
4623          replaces the comparison in the scalar loop, and since we don't
4624          count the scalar comparison against the scalar body, we shouldn't
4625          count that vector instruction against the vector body either.
4626
4627          Sometimes we can use unpacks instead of generating prologue
4628          masks and sometimes the prologue mask will fold to a constant,
4629          so the actual prologue cost might be smaller.  However, it's
4630          simpler and safer to use the worst-case cost; if this ends up
4631          being the tie-breaker between vectorizing or not, then it's
4632          probably better not to vectorize.  */
4633       (void) add_stmt_cost (target_cost_data, num_masks,
4634                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4635                             vect_prologue);
4636       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4637                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4638                             vect_body);
4639     }
4640   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4641     {
4642       /* Referring to the functions vect_set_loop_condition_partial_vectors
4643          and vect_set_loop_controls_directly, we need to generate each
4644          length in the prologue and in the loop body if required. Although
4645          there are some possible optimizations, we consider the worst case
4646          here.  */
4647
4648       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4649       signed char partial_load_store_bias
4650         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4651       bool need_iterate_p
4652         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4653            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4654
4655       /* Calculate how many statements to be added.  */
4656       unsigned int prologue_stmts = 0;
4657       unsigned int body_stmts = 0;
4658
4659       rgroup_controls *rgc;
4660       unsigned int num_vectors_m1;
4661       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4662         if (rgc->type)
4663           {
4664             /* May need one SHIFT for nitems_total computation.  */
4665             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4666             if (nitems != 1 && !niters_known_p)
4667               prologue_stmts += 1;
4668
4669             /* May need one MAX and one MINUS for wrap around.  */
4670             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4671               prologue_stmts += 2;
4672
4673             /* Need one MAX and one MINUS for each batch limit excepting for
4674                the 1st one.  */
4675             prologue_stmts += num_vectors_m1 * 2;
4676
4677             unsigned int num_vectors = num_vectors_m1 + 1;
4678
4679             /* Need to set up lengths in prologue, only one MIN required
4680                for each since start index is zero.  */
4681             prologue_stmts += num_vectors;
4682
4683             /* If we have a non-zero partial load bias, we need one PLUS
4684                to adjust the load length.  */
4685             if (partial_load_store_bias != 0)
4686               body_stmts += 1;
4687
4688             /* Each may need two MINs and one MINUS to update lengths in body
4689                for next iteration.  */
4690             if (need_iterate_p)
4691               body_stmts += 3 * num_vectors;
4692           }
4693
4694       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4695                             scalar_stmt, vect_prologue);
4696       (void) add_stmt_cost (target_cost_data, body_stmts,
4697                             scalar_stmt, vect_body);
4698     }
4699
4700   /* FORNOW: The scalar outside cost is incremented in one of the
4701      following ways:
4702
4703      1. The vectorizer checks for alignment and aliasing and generates
4704      a condition that allows dynamic vectorization.  A cost model
4705      check is ANDED with the versioning condition.  Hence scalar code
4706      path now has the added cost of the versioning check.
4707
4708        if (cost > th & versioning_check)
4709          jmp to vector code
4710
4711      Hence run-time scalar is incremented by not-taken branch cost.
4712
4713      2. The vectorizer then checks if a prologue is required.  If the
4714      cost model check was not done before during versioning, it has to
4715      be done before the prologue check.
4716
4717        if (cost <= th)
4718          prologue = scalar_iters
4719        if (prologue == 0)
4720          jmp to vector code
4721        else
4722          execute prologue
4723        if (prologue == num_iters)
4724          go to exit
4725
4726      Hence the run-time scalar cost is incremented by a taken branch,
4727      plus a not-taken branch, plus a taken branch cost.
4728
4729      3. The vectorizer then checks if an epilogue is required.  If the
4730      cost model check was not done before during prologue check, it
4731      has to be done with the epilogue check.
4732
4733        if (prologue == 0)
4734          jmp to vector code
4735        else
4736          execute prologue
4737        if (prologue == num_iters)
4738          go to exit
4739        vector code:
4740          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4741            jmp to epilogue
4742
4743      Hence the run-time scalar cost should be incremented by 2 taken
4744      branches.
4745
4746      TODO: The back end may reorder the BBS's differently and reverse
4747      conditions/branch directions.  Change the estimates below to
4748      something more reasonable.  */
4749
4750   /* If the number of iterations is known and we do not do versioning, we can
4751      decide whether to vectorize at compile time.  Hence the scalar version
4752      do not carry cost model guard costs.  */
4753   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4754       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4755     {
4756       /* Cost model check occurs at versioning.  */
4757       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4758         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4759       else
4760         {
4761           /* Cost model check occurs at prologue generation.  */
4762           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4763             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4764               + vect_get_stmt_cost (cond_branch_not_taken);
4765           /* Cost model check occurs at epilogue generation.  */
4766           else
4767             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4768         }
4769     }
4770
4771   /* Complete the target-specific cost calculations.  */
4772   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4773                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4774                suggested_unroll_factor);
4775
4776   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4777       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4778       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4779                     *suggested_unroll_factor,
4780                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4781     {
4782       if (dump_enabled_p ())
4783         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4784                          "can't unroll as unrolled vectorization factor larger"
4785                          " than maximum vectorization factor: "
4786                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4787                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4788       *suggested_unroll_factor = 1;
4789     }
4790
4791   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4792
4793   if (dump_enabled_p ())
4794     {
4795       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4796       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4797                    vec_inside_cost);
4798       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4799                    vec_prologue_cost);
4800       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4801                    vec_epilogue_cost);
4802       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4803                    scalar_single_iter_cost);
4804       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4805                    scalar_outside_cost);
4806       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4807                    vec_outside_cost);
4808       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4809                    peel_iters_prologue);
4810       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4811                    peel_iters_epilogue);
4812     }
4813
4814   /* Calculate number of iterations required to make the vector version
4815      profitable, relative to the loop bodies only.  The following condition
4816      must hold true:
4817      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4818      where
4819      SIC = scalar iteration cost, VIC = vector iteration cost,
4820      VOC = vector outside cost, VF = vectorization factor,
4821      NPEEL = prologue iterations + epilogue iterations,
4822      SOC = scalar outside cost for run time cost model check.  */
4823
4824   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4825                           - vec_inside_cost);
4826   if (saving_per_viter <= 0)
4827     {
4828       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4829         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4830                     "vectorization did not happen for a simd loop");
4831
4832       if (dump_enabled_p ())
4833         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4834                          "cost model: the vector iteration cost = %d "
4835                          "divided by the scalar iteration cost = %d "
4836                          "is greater or equal to the vectorization factor = %d"
4837                          ".\n",
4838                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4839       *ret_min_profitable_niters = -1;
4840       *ret_min_profitable_estimate = -1;
4841       return;
4842     }
4843
4844   /* ??? The "if" arm is written to handle all cases; see below for what
4845      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4846   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4847     {
4848       /* Rewriting the condition above in terms of the number of
4849          vector iterations (vniters) rather than the number of
4850          scalar iterations (niters) gives:
4851
4852          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4853
4854          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4855
4856          For integer N, X and Y when X > 0:
4857
4858          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4859       int outside_overhead = (vec_outside_cost
4860                               - scalar_single_iter_cost * peel_iters_prologue
4861                               - scalar_single_iter_cost * peel_iters_epilogue
4862                               - scalar_outside_cost);
4863       /* We're only interested in cases that require at least one
4864          vector iteration.  */
4865       int min_vec_niters = 1;
4866       if (outside_overhead > 0)
4867         min_vec_niters = outside_overhead / saving_per_viter + 1;
4868
4869       if (dump_enabled_p ())
4870         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4871                      min_vec_niters);
4872
4873       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4874         {
4875           /* Now that we know the minimum number of vector iterations,
4876              find the minimum niters for which the scalar cost is larger:
4877
4878              SIC * niters > VIC * vniters + VOC - SOC
4879
4880              We know that the minimum niters is no more than
4881              vniters * VF + NPEEL, but it might be (and often is) less
4882              than that if a partial vector iteration is cheaper than the
4883              equivalent scalar code.  */
4884           int threshold = (vec_inside_cost * min_vec_niters
4885                            + vec_outside_cost
4886                            - scalar_outside_cost);
4887           if (threshold <= 0)
4888             min_profitable_iters = 1;
4889           else
4890             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4891         }
4892       else
4893         /* Convert the number of vector iterations into a number of
4894            scalar iterations.  */
4895         min_profitable_iters = (min_vec_niters * assumed_vf
4896                                 + peel_iters_prologue
4897                                 + peel_iters_epilogue);
4898     }
4899   else
4900     {
4901       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4902                               * assumed_vf
4903                               - vec_inside_cost * peel_iters_prologue
4904                               - vec_inside_cost * peel_iters_epilogue);
4905       if (min_profitable_iters <= 0)
4906         min_profitable_iters = 0;
4907       else
4908         {
4909           min_profitable_iters /= saving_per_viter;
4910
4911           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4912               <= (((int) vec_inside_cost * min_profitable_iters)
4913                   + (((int) vec_outside_cost - scalar_outside_cost)
4914                      * assumed_vf)))
4915             min_profitable_iters++;
4916         }
4917     }
4918
4919   if (dump_enabled_p ())
4920     dump_printf (MSG_NOTE,
4921                  "  Calculated minimum iters for profitability: %d\n",
4922                  min_profitable_iters);
4923
4924   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4925       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4926     /* We want the vectorized loop to execute at least once.  */
4927     min_profitable_iters = assumed_vf + peel_iters_prologue;
4928   else if (min_profitable_iters < peel_iters_prologue)
4929     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4930        vectorized loop executes at least once.  */
4931     min_profitable_iters = peel_iters_prologue;
4932
4933   if (dump_enabled_p ())
4934     dump_printf_loc (MSG_NOTE, vect_location,
4935                      "  Runtime profitability threshold = %d\n",
4936                      min_profitable_iters);
4937
4938   *ret_min_profitable_niters = min_profitable_iters;
4939
4940   /* Calculate number of iterations required to make the vector version
4941      profitable, relative to the loop bodies only.
4942
4943      Non-vectorized variant is SIC * niters and it must win over vector
4944      variant on the expected loop trip count.  The following condition must hold true:
4945      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4946
4947   if (vec_outside_cost <= 0)
4948     min_profitable_estimate = 0;
4949   /* ??? This "else if" arm is written to handle all cases; see below for
4950      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4951   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4952     {
4953       /* This is a repeat of the code above, but with + SOC rather
4954          than - SOC.  */
4955       int outside_overhead = (vec_outside_cost
4956                               - scalar_single_iter_cost * peel_iters_prologue
4957                               - scalar_single_iter_cost * peel_iters_epilogue
4958                               + scalar_outside_cost);
4959       int min_vec_niters = 1;
4960       if (outside_overhead > 0)
4961         min_vec_niters = outside_overhead / saving_per_viter + 1;
4962
4963       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4964         {
4965           int threshold = (vec_inside_cost * min_vec_niters
4966                            + vec_outside_cost
4967                            + scalar_outside_cost);
4968           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4969         }
4970       else
4971         min_profitable_estimate = (min_vec_niters * assumed_vf
4972                                    + peel_iters_prologue
4973                                    + peel_iters_epilogue);
4974     }
4975   else
4976     {
4977       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4978                                  * assumed_vf
4979                                  - vec_inside_cost * peel_iters_prologue
4980                                  - vec_inside_cost * peel_iters_epilogue)
4981                                  / ((scalar_single_iter_cost * assumed_vf)
4982                                    - vec_inside_cost);
4983     }
4984   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4985   if (dump_enabled_p ())
4986     dump_printf_loc (MSG_NOTE, vect_location,
4987                      "  Static estimate profitability threshold = %d\n",
4988                      min_profitable_estimate);
4989
4990   *ret_min_profitable_estimate = min_profitable_estimate;
4991 }
4992
4993 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4994    vector elements (not bits) for a vector with NELT elements.  */
4995 static void
4996 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4997                               vec_perm_builder *sel)
4998 {
4999   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5000      by vec_perm_indices.  */
5001   sel->new_vector (nelt, 1, 3);
5002   for (unsigned int i = 0; i < 3; i++)
5003     sel->quick_push (i + offset);
5004 }
5005
5006 /* Checks whether the target supports whole-vector shifts for vectors of mode
5007    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5008    it supports vec_perm_const with masks for all necessary shift amounts.  */
5009 static bool
5010 have_whole_vector_shift (machine_mode mode)
5011 {
5012   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5013     return true;
5014
5015   /* Variable-length vectors should be handled via the optab.  */
5016   unsigned int nelt;
5017   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5018     return false;
5019
5020   vec_perm_builder sel;
5021   vec_perm_indices indices;
5022   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5023     {
5024       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5025       indices.new_vector (sel, 2, nelt);
5026       if (!can_vec_perm_const_p (mode, mode, indices, false))
5027         return false;
5028     }
5029   return true;
5030 }
5031
5032 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5033    multiplication operands have differing signs and (b) we intend
5034    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5035    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5036
5037 static bool
5038 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5039                                  stmt_vec_info stmt_info)
5040 {
5041   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5042   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5043     return false;
5044
5045   tree rhs1 = gimple_assign_rhs1 (assign);
5046   tree rhs2 = gimple_assign_rhs2 (assign);
5047   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5048     return false;
5049
5050   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5051   gcc_assert (reduc_info->is_reduc_info);
5052   return !directly_supported_p (DOT_PROD_EXPR,
5053                                 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5054                                 optab_vector_mixed_sign);
5055 }
5056
5057 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5058    functions. Design better to avoid maintenance issues.  */
5059
5060 /* Function vect_model_reduction_cost.
5061
5062    Models cost for a reduction operation, including the vector ops
5063    generated within the strip-mine loop in some cases, the initial
5064    definition before the loop, and the epilogue code that must be generated.  */
5065
5066 static void
5067 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5068                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5069                            vect_reduction_type reduction_type,
5070                            int ncopies, stmt_vector_for_cost *cost_vec)
5071 {
5072   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5073   tree vectype;
5074   machine_mode mode;
5075   class loop *loop = NULL;
5076
5077   if (loop_vinfo)
5078     loop = LOOP_VINFO_LOOP (loop_vinfo);
5079
5080   /* Condition reductions generate two reductions in the loop.  */
5081   if (reduction_type == COND_REDUCTION)
5082     ncopies *= 2;
5083
5084   vectype = STMT_VINFO_VECTYPE (stmt_info);
5085   mode = TYPE_MODE (vectype);
5086   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5087
5088   gimple_match_op op;
5089   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5090     gcc_unreachable ();
5091
5092   bool emulated_mixed_dot_prod
5093     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5094   if (reduction_type == EXTRACT_LAST_REDUCTION)
5095     /* No extra instructions are needed in the prologue.  The loop body
5096        operations are costed in vectorizable_condition.  */
5097     inside_cost = 0;
5098   else if (reduction_type == FOLD_LEFT_REDUCTION)
5099     {
5100       /* No extra instructions needed in the prologue.  */
5101       prologue_cost = 0;
5102
5103       if (reduc_fn != IFN_LAST)
5104         /* Count one reduction-like operation per vector.  */
5105         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5106                                         stmt_info, 0, vect_body);
5107       else
5108         {
5109           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5110           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5111           inside_cost = record_stmt_cost (cost_vec, nelements,
5112                                           vec_to_scalar, stmt_info, 0,
5113                                           vect_body);
5114           inside_cost += record_stmt_cost (cost_vec, nelements,
5115                                            scalar_stmt, stmt_info, 0,
5116                                            vect_body);
5117         }
5118     }
5119   else
5120     {
5121       /* Add in the cost of the initial definitions.  */
5122       int prologue_stmts;
5123       if (reduction_type == COND_REDUCTION)
5124         /* For cond reductions we have four vectors: initial index, step,
5125            initial result of the data reduction, initial value of the index
5126            reduction.  */
5127         prologue_stmts = 4;
5128       else if (emulated_mixed_dot_prod)
5129         /* We need the initial reduction value and two invariants:
5130            one that contains the minimum signed value and one that
5131            contains half of its negative.  */
5132         prologue_stmts = 3;
5133       else
5134         prologue_stmts = 1;
5135       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5136                                          scalar_to_vec, stmt_info, 0,
5137                                          vect_prologue);
5138     }
5139
5140   /* Determine cost of epilogue code.
5141
5142      We have a reduction operator that will reduce the vector in one statement.
5143      Also requires scalar extract.  */
5144
5145   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5146     {
5147       if (reduc_fn != IFN_LAST)
5148         {
5149           if (reduction_type == COND_REDUCTION)
5150             {
5151               /* An EQ stmt and an COND_EXPR stmt.  */
5152               epilogue_cost += record_stmt_cost (cost_vec, 2,
5153                                                  vector_stmt, stmt_info, 0,
5154                                                  vect_epilogue);
5155               /* Reduction of the max index and a reduction of the found
5156                  values.  */
5157               epilogue_cost += record_stmt_cost (cost_vec, 2,
5158                                                  vec_to_scalar, stmt_info, 0,
5159                                                  vect_epilogue);
5160               /* A broadcast of the max value.  */
5161               epilogue_cost += record_stmt_cost (cost_vec, 1,
5162                                                  scalar_to_vec, stmt_info, 0,
5163                                                  vect_epilogue);
5164             }
5165           else
5166             {
5167               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5168                                                  stmt_info, 0, vect_epilogue);
5169               epilogue_cost += record_stmt_cost (cost_vec, 1,
5170                                                  vec_to_scalar, stmt_info, 0,
5171                                                  vect_epilogue);
5172             }
5173         }
5174       else if (reduction_type == COND_REDUCTION)
5175         {
5176           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5177           /* Extraction of scalar elements.  */
5178           epilogue_cost += record_stmt_cost (cost_vec,
5179                                              2 * estimated_nunits,
5180                                              vec_to_scalar, stmt_info, 0,
5181                                              vect_epilogue);
5182           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5183           epilogue_cost += record_stmt_cost (cost_vec,
5184                                              2 * estimated_nunits - 3,
5185                                              scalar_stmt, stmt_info, 0,
5186                                              vect_epilogue);
5187         }
5188       else if (reduction_type == EXTRACT_LAST_REDUCTION
5189                || reduction_type == FOLD_LEFT_REDUCTION)
5190         /* No extra instructions need in the epilogue.  */
5191         ;
5192       else
5193         {
5194           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5195           tree bitsize = TYPE_SIZE (op.type);
5196           int element_bitsize = tree_to_uhwi (bitsize);
5197           int nelements = vec_size_in_bits / element_bitsize;
5198
5199           if (op.code == COND_EXPR)
5200             op.code = MAX_EXPR;
5201
5202           /* We have a whole vector shift available.  */
5203           if (VECTOR_MODE_P (mode)
5204               && directly_supported_p (op.code, vectype)
5205               && have_whole_vector_shift (mode))
5206             {
5207               /* Final reduction via vector shifts and the reduction operator.
5208                  Also requires scalar extract.  */
5209               epilogue_cost += record_stmt_cost (cost_vec,
5210                                                  exact_log2 (nelements) * 2,
5211                                                  vector_stmt, stmt_info, 0,
5212                                                  vect_epilogue);
5213               epilogue_cost += record_stmt_cost (cost_vec, 1,
5214                                                  vec_to_scalar, stmt_info, 0,
5215                                                  vect_epilogue);
5216             }
5217           else
5218             /* Use extracts and reduction op for final reduction.  For N
5219                elements, we have N extracts and N-1 reduction ops.  */
5220             epilogue_cost += record_stmt_cost (cost_vec,
5221                                                nelements + nelements - 1,
5222                                                vector_stmt, stmt_info, 0,
5223                                                vect_epilogue);
5224         }
5225     }
5226
5227   if (dump_enabled_p ())
5228     dump_printf (MSG_NOTE,
5229                  "vect_model_reduction_cost: inside_cost = %d, "
5230                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5231                  prologue_cost, epilogue_cost);
5232 }
5233
5234 /* SEQ is a sequence of instructions that initialize the reduction
5235    described by REDUC_INFO.  Emit them in the appropriate place.  */
5236
5237 static void
5238 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5239                                 stmt_vec_info reduc_info, gimple *seq)
5240 {
5241   if (reduc_info->reused_accumulator)
5242     {
5243       /* When reusing an accumulator from the main loop, we only need
5244          initialization instructions if the main loop can be skipped.
5245          In that case, emit the initialization instructions at the end
5246          of the guard block that does the skip.  */
5247       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5248       gcc_assert (skip_edge);
5249       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5250       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5251     }
5252   else
5253     {
5254       /* The normal case: emit the initialization instructions on the
5255          preheader edge.  */
5256       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5257       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5258     }
5259 }
5260
5261 /* Function get_initial_def_for_reduction
5262
5263    Input:
5264    REDUC_INFO - the info_for_reduction
5265    INIT_VAL - the initial value of the reduction variable
5266    NEUTRAL_OP - a value that has no effect on the reduction, as per
5267                 neutral_op_for_reduction
5268
5269    Output:
5270    Return a vector variable, initialized according to the operation that
5271         STMT_VINFO performs. This vector will be used as the initial value
5272         of the vector of partial results.
5273
5274    The value we need is a vector in which element 0 has value INIT_VAL
5275    and every other element has value NEUTRAL_OP.  */
5276
5277 static tree
5278 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5279                                stmt_vec_info reduc_info,
5280                                tree init_val, tree neutral_op)
5281 {
5282   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5283   tree scalar_type = TREE_TYPE (init_val);
5284   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5285   tree init_def;
5286   gimple_seq stmts = NULL;
5287
5288   gcc_assert (vectype);
5289
5290   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5291               || SCALAR_FLOAT_TYPE_P (scalar_type));
5292
5293   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5294               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5295
5296   if (operand_equal_p (init_val, neutral_op))
5297     {
5298       /* If both elements are equal then the vector described above is
5299          just a splat.  */
5300       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5301       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5302     }
5303   else
5304     {
5305       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5306       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5307       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5308         {
5309           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5310              element 0.  */
5311           init_def = gimple_build_vector_from_val (&stmts, vectype,
5312                                                    neutral_op);
5313           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5314                                    vectype, init_def, init_val);
5315         }
5316       else
5317         {
5318           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5319           tree_vector_builder elts (vectype, 1, 2);
5320           elts.quick_push (init_val);
5321           elts.quick_push (neutral_op);
5322           init_def = gimple_build_vector (&stmts, &elts);
5323         }
5324     }
5325
5326   if (stmts)
5327     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5328   return init_def;
5329 }
5330
5331 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5332    which performs a reduction involving GROUP_SIZE scalar statements.
5333    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5334    is nonnull, introducing extra elements of that value will not change the
5335    result.  */
5336
5337 static void
5338 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5339                                 stmt_vec_info reduc_info,
5340                                 vec<tree> *vec_oprnds,
5341                                 unsigned int number_of_vectors,
5342                                 unsigned int group_size, tree neutral_op)
5343 {
5344   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5345   unsigned HOST_WIDE_INT nunits;
5346   unsigned j, number_of_places_left_in_vector;
5347   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5348   unsigned int i;
5349
5350   gcc_assert (group_size == initial_values.length () || neutral_op);
5351
5352   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5353      created vectors. It is greater than 1 if unrolling is performed.
5354
5355      For example, we have two scalar operands, s1 and s2 (e.g., group of
5356      strided accesses of size two), while NUNITS is four (i.e., four scalars
5357      of this type can be packed in a vector).  The output vector will contain
5358      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5359      will be 2).
5360
5361      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5362      vectors containing the operands.
5363
5364      For example, NUNITS is four as before, and the group size is 8
5365      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5366      {s5, s6, s7, s8}.  */
5367
5368   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5369     nunits = group_size;
5370
5371   number_of_places_left_in_vector = nunits;
5372   bool constant_p = true;
5373   tree_vector_builder elts (vector_type, nunits, 1);
5374   elts.quick_grow (nunits);
5375   gimple_seq ctor_seq = NULL;
5376   for (j = 0; j < nunits * number_of_vectors; ++j)
5377     {
5378       tree op;
5379       i = j % group_size;
5380
5381       /* Get the def before the loop.  In reduction chain we have only
5382          one initial value.  Else we have as many as PHIs in the group.  */
5383       if (i >= initial_values.length () || (j > i && neutral_op))
5384         op = neutral_op;
5385       else
5386         op = initial_values[i];
5387
5388       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5389       number_of_places_left_in_vector--;
5390       elts[nunits - number_of_places_left_in_vector - 1] = op;
5391       if (!CONSTANT_CLASS_P (op))
5392         constant_p = false;
5393
5394       if (number_of_places_left_in_vector == 0)
5395         {
5396           tree init;
5397           if (constant_p && !neutral_op
5398               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5399               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5400             /* Build the vector directly from ELTS.  */
5401             init = gimple_build_vector (&ctor_seq, &elts);
5402           else if (neutral_op)
5403             {
5404               /* Build a vector of the neutral value and shift the
5405                  other elements into place.  */
5406               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5407                                                    neutral_op);
5408               int k = nunits;
5409               while (k > 0 && elts[k - 1] == neutral_op)
5410                 k -= 1;
5411               while (k > 0)
5412                 {
5413                   k -= 1;
5414                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5415                                        vector_type, init, elts[k]);
5416                 }
5417             }
5418           else
5419             {
5420               /* First time round, duplicate ELTS to fill the
5421                  required number of vectors.  */
5422               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5423                                         elts, number_of_vectors, *vec_oprnds);
5424               break;
5425             }
5426           vec_oprnds->quick_push (init);
5427
5428           number_of_places_left_in_vector = nunits;
5429           elts.new_vector (vector_type, nunits, 1);
5430           elts.quick_grow (nunits);
5431           constant_p = true;
5432         }
5433     }
5434   if (ctor_seq != NULL)
5435     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5436 }
5437
5438 /* For a statement STMT_INFO taking part in a reduction operation return
5439    the stmt_vec_info the meta information is stored on.  */
5440
5441 stmt_vec_info
5442 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5443 {
5444   stmt_info = vect_orig_stmt (stmt_info);
5445   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5446   if (!is_a <gphi *> (stmt_info->stmt)
5447       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5448     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5449   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5450   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5451     {
5452       if (gimple_phi_num_args (phi) == 1)
5453         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5454     }
5455   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5456     {
5457       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5458       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5459         stmt_info = info;
5460     }
5461   return stmt_info;
5462 }
5463
5464 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5465    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5466    return false.  */
5467
5468 static bool
5469 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5470                                 stmt_vec_info reduc_info)
5471 {
5472   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5473   if (!main_loop_vinfo)
5474     return false;
5475
5476   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5477     return false;
5478
5479   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5480   auto_vec<tree, 16> main_loop_results (num_phis);
5481   auto_vec<tree, 16> initial_values (num_phis);
5482   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5483     {
5484       /* The epilogue loop can be entered either from the main loop or
5485          from an earlier guard block.  */
5486       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5487       for (tree incoming_value : reduc_info->reduc_initial_values)
5488         {
5489           /* Look for:
5490
5491                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5492                                     INITIAL_VALUE(guard block)>.  */
5493           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5494
5495           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5496           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5497
5498           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5499           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5500
5501           main_loop_results.quick_push (from_main_loop);
5502           initial_values.quick_push (from_skip);
5503         }
5504     }
5505   else
5506     /* The main loop dominates the epilogue loop.  */
5507     main_loop_results.splice (reduc_info->reduc_initial_values);
5508
5509   /* See if the main loop has the kind of accumulator we need.  */
5510   vect_reusable_accumulator *accumulator
5511     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5512   if (!accumulator
5513       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5514       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5515                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5516     return false;
5517
5518   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5519   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5520   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5521   unsigned HOST_WIDE_INT m;
5522   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5523                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5524     return false;
5525   /* Check the intermediate vector types and operations are available.  */
5526   tree prev_vectype = old_vectype;
5527   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5528   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5529     {
5530       intermediate_nunits = exact_div (intermediate_nunits, 2);
5531       tree intermediate_vectype = get_related_vectype_for_scalar_type
5532         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5533       if (!intermediate_vectype
5534           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5535                                     intermediate_vectype)
5536           || !can_vec_extract (TYPE_MODE (prev_vectype),
5537                                TYPE_MODE (intermediate_vectype)))
5538         return false;
5539       prev_vectype = intermediate_vectype;
5540     }
5541
5542   /* Non-SLP reductions might apply an adjustment after the reduction
5543      operation, in order to simplify the initialization of the accumulator.
5544      If the epilogue loop carries on from where the main loop left off,
5545      it should apply the same adjustment to the final reduction result.
5546
5547      If the epilogue loop can also be entered directly (rather than via
5548      the main loop), we need to be able to handle that case in the same way,
5549      with the same adjustment.  (In principle we could add a PHI node
5550      to select the correct adjustment, but in practice that shouldn't be
5551      necessary.)  */
5552   tree main_adjustment
5553     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5554   if (loop_vinfo->main_loop_edge && main_adjustment)
5555     {
5556       gcc_assert (num_phis == 1);
5557       tree initial_value = initial_values[0];
5558       /* Check that we can use INITIAL_VALUE as the adjustment and
5559          initialize the accumulator with a neutral value instead.  */
5560       if (!operand_equal_p (initial_value, main_adjustment))
5561         return false;
5562       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5563       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5564                                                     code, initial_value);
5565     }
5566   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5567   reduc_info->reduc_initial_values.truncate (0);
5568   reduc_info->reduc_initial_values.splice (initial_values);
5569   reduc_info->reused_accumulator = accumulator;
5570   return true;
5571 }
5572
5573 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5574    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5575
5576 static tree
5577 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5578                             gimple_seq *seq)
5579 {
5580   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5581   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5582   tree stype = TREE_TYPE (vectype);
5583   tree new_temp = vec_def;
5584   while (nunits > nunits1)
5585     {
5586       nunits /= 2;
5587       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5588                                                            stype, nunits);
5589       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5590
5591       /* The target has to make sure we support lowpart/highpart
5592          extraction, either via direct vector extract or through
5593          an integer mode punning.  */
5594       tree dst1, dst2;
5595       gimple *epilog_stmt;
5596       if (convert_optab_handler (vec_extract_optab,
5597                                  TYPE_MODE (TREE_TYPE (new_temp)),
5598                                  TYPE_MODE (vectype1))
5599           != CODE_FOR_nothing)
5600         {
5601           /* Extract sub-vectors directly once vec_extract becomes
5602              a conversion optab.  */
5603           dst1 = make_ssa_name (vectype1);
5604           epilog_stmt
5605               = gimple_build_assign (dst1, BIT_FIELD_REF,
5606                                      build3 (BIT_FIELD_REF, vectype1,
5607                                              new_temp, TYPE_SIZE (vectype1),
5608                                              bitsize_int (0)));
5609           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5610           dst2 =  make_ssa_name (vectype1);
5611           epilog_stmt
5612               = gimple_build_assign (dst2, BIT_FIELD_REF,
5613                                      build3 (BIT_FIELD_REF, vectype1,
5614                                              new_temp, TYPE_SIZE (vectype1),
5615                                              bitsize_int (bitsize)));
5616           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5617         }
5618       else
5619         {
5620           /* Extract via punning to appropriately sized integer mode
5621              vector.  */
5622           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5623           tree etype = build_vector_type (eltype, 2);
5624           gcc_assert (convert_optab_handler (vec_extract_optab,
5625                                              TYPE_MODE (etype),
5626                                              TYPE_MODE (eltype))
5627                       != CODE_FOR_nothing);
5628           tree tem = make_ssa_name (etype);
5629           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5630                                              build1 (VIEW_CONVERT_EXPR,
5631                                                      etype, new_temp));
5632           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5633           new_temp = tem;
5634           tem = make_ssa_name (eltype);
5635           epilog_stmt
5636               = gimple_build_assign (tem, BIT_FIELD_REF,
5637                                      build3 (BIT_FIELD_REF, eltype,
5638                                              new_temp, TYPE_SIZE (eltype),
5639                                              bitsize_int (0)));
5640           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5641           dst1 = make_ssa_name (vectype1);
5642           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5643                                              build1 (VIEW_CONVERT_EXPR,
5644                                                      vectype1, tem));
5645           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5646           tem = make_ssa_name (eltype);
5647           epilog_stmt
5648               = gimple_build_assign (tem, BIT_FIELD_REF,
5649                                      build3 (BIT_FIELD_REF, eltype,
5650                                              new_temp, TYPE_SIZE (eltype),
5651                                              bitsize_int (bitsize)));
5652           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5653           dst2 =  make_ssa_name (vectype1);
5654           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5655                                              build1 (VIEW_CONVERT_EXPR,
5656                                                      vectype1, tem));
5657           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5658         }
5659
5660       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5661     }
5662
5663   return new_temp;
5664 }
5665
5666 /* Function vect_create_epilog_for_reduction
5667
5668    Create code at the loop-epilog to finalize the result of a reduction
5669    computation.
5670
5671    STMT_INFO is the scalar reduction stmt that is being vectorized.
5672    SLP_NODE is an SLP node containing a group of reduction statements. The
5673      first one in this group is STMT_INFO.
5674    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5675    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5676      (counting from 0)
5677
5678    This function:
5679    1. Completes the reduction def-use cycles.
5680    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5681       by calling the function specified by REDUC_FN if available, or by
5682       other means (whole-vector shifts or a scalar loop).
5683       The function also creates a new phi node at the loop exit to preserve
5684       loop-closed form, as illustrated below.
5685
5686      The flow at the entry to this function:
5687
5688         loop:
5689           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5690           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5691           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5692         loop_exit:
5693           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5694           use <s_out0>
5695           use <s_out0>
5696
5697      The above is transformed by this function into:
5698
5699         loop:
5700           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5701           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5702           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5703         loop_exit:
5704           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5705           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5706           v_out2 = reduce <v_out1>
5707           s_out3 = extract_field <v_out2, 0>
5708           s_out4 = adjust_result <s_out3>
5709           use <s_out4>
5710           use <s_out4>
5711 */
5712
5713 static void
5714 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5715                                   stmt_vec_info stmt_info,
5716                                   slp_tree slp_node,
5717                                   slp_instance slp_node_instance)
5718 {
5719   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5720   gcc_assert (reduc_info->is_reduc_info);
5721   /* For double reductions we need to get at the inner loop reduction
5722      stmt which has the meta info attached.  Our stmt_info is that of the
5723      loop-closed PHI of the inner loop which we remember as
5724      def for the reduction PHI generation.  */
5725   bool double_reduc = false;
5726   stmt_vec_info rdef_info = stmt_info;
5727   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5728     {
5729       gcc_assert (!slp_node);
5730       double_reduc = true;
5731       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5732                                             (stmt_info->stmt, 0));
5733       stmt_info = vect_stmt_to_vectorize (stmt_info);
5734     }
5735   gphi *reduc_def_stmt
5736     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5737   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5738   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5739   tree vectype;
5740   machine_mode mode;
5741   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5742   basic_block exit_bb;
5743   tree scalar_dest;
5744   tree scalar_type;
5745   gimple *new_phi = NULL, *phi;
5746   gimple_stmt_iterator exit_gsi;
5747   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5748   gimple *epilog_stmt = NULL;
5749   gimple *exit_phi;
5750   tree bitsize;
5751   tree def;
5752   tree orig_name, scalar_result;
5753   imm_use_iterator imm_iter, phi_imm_iter;
5754   use_operand_p use_p, phi_use_p;
5755   gimple *use_stmt;
5756   auto_vec<tree> reduc_inputs;
5757   int j, i;
5758   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5759   unsigned int group_size = 1, k;
5760   auto_vec<gimple *> phis;
5761   /* SLP reduction without reduction chain, e.g.,
5762      # a1 = phi <a2, a0>
5763      # b1 = phi <b2, b0>
5764      a2 = operation (a1)
5765      b2 = operation (b1)  */
5766   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5767   bool direct_slp_reduc;
5768   tree induction_index = NULL_TREE;
5769
5770   if (slp_node)
5771     group_size = SLP_TREE_LANES (slp_node);
5772
5773   if (nested_in_vect_loop_p (loop, stmt_info))
5774     {
5775       outer_loop = loop;
5776       loop = loop->inner;
5777       gcc_assert (!slp_node && double_reduc);
5778     }
5779
5780   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5781   gcc_assert (vectype);
5782   mode = TYPE_MODE (vectype);
5783
5784   tree induc_val = NULL_TREE;
5785   tree adjustment_def = NULL;
5786   if (slp_node)
5787     ;
5788   else
5789     {
5790       /* Optimize: for induction condition reduction, if we can't use zero
5791          for induc_val, use initial_def.  */
5792       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5793         induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5794       else if (double_reduc)
5795         ;
5796       else
5797         adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5798     }
5799
5800   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5801   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5802   if (slp_reduc)
5803     /* All statements produce live-out values.  */
5804     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5805   else if (slp_node)
5806     {
5807       /* The last statement in the reduction chain produces the live-out
5808          value.  Note SLP optimization can shuffle scalar stmts to
5809          optimize permutations so we have to search for the last stmt.  */
5810       for (k = 0; k < group_size; ++k)
5811         if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5812           {
5813             single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5814             break;
5815           }
5816     }
5817
5818   unsigned vec_num;
5819   int ncopies;
5820   if (slp_node)
5821     {
5822       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5823       ncopies = 1;
5824     }
5825   else
5826     {
5827       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5828       vec_num = 1;
5829       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5830     }
5831
5832   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5833      which is updated with the current index of the loop for every match of
5834      the original loop's cond_expr (VEC_STMT).  This results in a vector
5835      containing the last time the condition passed for that vector lane.
5836      The first match will be a 1 to allow 0 to be used for non-matching
5837      indexes.  If there are no matches at all then the vector will be all
5838      zeroes.
5839
5840      PR92772: This algorithm is broken for architectures that support
5841      masked vectors, but do not provide fold_extract_last.  */
5842   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5843     {
5844       auto_vec<std::pair<tree, bool>, 2> ccompares;
5845       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5846       cond_info = vect_stmt_to_vectorize (cond_info);
5847       while (cond_info != reduc_info)
5848         {
5849           if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5850             {
5851               gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5852               gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5853               ccompares.safe_push
5854                 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5855                                  STMT_VINFO_REDUC_IDX (cond_info) == 2));
5856             }
5857           cond_info
5858             = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5859                                                  1 + STMT_VINFO_REDUC_IDX
5860                                                         (cond_info)));
5861           cond_info = vect_stmt_to_vectorize (cond_info);
5862         }
5863       gcc_assert (ccompares.length () != 0);
5864
5865       tree indx_before_incr, indx_after_incr;
5866       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5867       int scalar_precision
5868         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5869       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5870       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5871         (TYPE_MODE (vectype), cr_index_scalar_type,
5872          TYPE_VECTOR_SUBPARTS (vectype));
5873
5874       /* First we create a simple vector induction variable which starts
5875          with the values {1,2,3,...} (SERIES_VECT) and increments by the
5876          vector size (STEP).  */
5877
5878       /* Create a {1,2,3,...} vector.  */
5879       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5880
5881       /* Create a vector of the step value.  */
5882       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5883       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5884
5885       /* Create an induction variable.  */
5886       gimple_stmt_iterator incr_gsi;
5887       bool insert_after;
5888       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5889       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5890                  insert_after, &indx_before_incr, &indx_after_incr);
5891
5892       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5893          filled with zeros (VEC_ZERO).  */
5894
5895       /* Create a vector of 0s.  */
5896       tree zero = build_zero_cst (cr_index_scalar_type);
5897       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5898
5899       /* Create a vector phi node.  */
5900       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5901       new_phi = create_phi_node (new_phi_tree, loop->header);
5902       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5903                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
5904
5905       /* Now take the condition from the loops original cond_exprs
5906          and produce a new cond_exprs (INDEX_COND_EXPR) which for
5907          every match uses values from the induction variable
5908          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5909          (NEW_PHI_TREE).
5910          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5911          the new cond_expr (INDEX_COND_EXPR).  */
5912       gimple_seq stmts = NULL;
5913       for (int i = ccompares.length () - 1; i != -1; --i)
5914         {
5915           tree ccompare = ccompares[i].first;
5916           if (ccompares[i].second)
5917             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5918                                          cr_index_vector_type,
5919                                          ccompare,
5920                                          indx_before_incr, new_phi_tree);
5921           else
5922             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5923                                          cr_index_vector_type,
5924                                          ccompare,
5925                                          new_phi_tree, indx_before_incr);
5926         }
5927       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5928
5929       /* Update the phi with the vec cond.  */
5930       induction_index = new_phi_tree;
5931       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5932                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5933     }
5934
5935   /* 2. Create epilog code.
5936         The reduction epilog code operates across the elements of the vector
5937         of partial results computed by the vectorized loop.
5938         The reduction epilog code consists of:
5939
5940         step 1: compute the scalar result in a vector (v_out2)
5941         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5942         step 3: adjust the scalar result (s_out3) if needed.
5943
5944         Step 1 can be accomplished using one the following three schemes:
5945           (scheme 1) using reduc_fn, if available.
5946           (scheme 2) using whole-vector shifts, if available.
5947           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5948                      combined.
5949
5950           The overall epilog code looks like this:
5951
5952           s_out0 = phi <s_loop>         # original EXIT_PHI
5953           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5954           v_out2 = reduce <v_out1>              # step 1
5955           s_out3 = extract_field <v_out2, 0>    # step 2
5956           s_out4 = adjust_result <s_out3>       # step 3
5957
5958           (step 3 is optional, and steps 1 and 2 may be combined).
5959           Lastly, the uses of s_out0 are replaced by s_out4.  */
5960
5961
5962   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5963          v_out1 = phi <VECT_DEF>
5964          Store them in NEW_PHIS.  */
5965   if (double_reduc)
5966     loop = outer_loop;
5967   exit_bb = single_exit (loop)->dest;
5968   exit_gsi = gsi_after_labels (exit_bb);
5969   reduc_inputs.create (slp_node ? vec_num : ncopies);
5970   for (unsigned i = 0; i < vec_num; i++)
5971     {
5972       gimple_seq stmts = NULL;
5973       if (slp_node)
5974         def = vect_get_slp_vect_def (slp_node, i);
5975       else
5976         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5977       for (j = 0; j < ncopies; j++)
5978         {
5979           tree new_def = copy_ssa_name (def);
5980           phi = create_phi_node (new_def, exit_bb);
5981           if (j)
5982             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5983           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5984           new_def = gimple_convert (&stmts, vectype, new_def);
5985           reduc_inputs.quick_push (new_def);
5986         }
5987       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5988     }
5989
5990   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5991          (i.e. when reduc_fn is not available) and in the final adjustment
5992          code (if needed).  Also get the original scalar reduction variable as
5993          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5994          represents a reduction pattern), the tree-code and scalar-def are
5995          taken from the original stmt that the pattern-stmt (STMT) replaces.
5996          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5997          are taken from STMT.  */
5998
5999   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6000   if (orig_stmt_info != stmt_info)
6001     {
6002       /* Reduction pattern  */
6003       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6004       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6005     }
6006
6007   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6008   scalar_type = TREE_TYPE (scalar_dest);
6009   scalar_results.truncate (0);
6010   scalar_results.reserve_exact (group_size);
6011   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6012   bitsize = TYPE_SIZE (scalar_type);
6013
6014   /* True if we should implement SLP_REDUC using native reduction operations
6015      instead of scalar operations.  */
6016   direct_slp_reduc = (reduc_fn != IFN_LAST
6017                       && slp_reduc
6018                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6019
6020   /* In case of reduction chain, e.g.,
6021      # a1 = phi <a3, a0>
6022      a2 = operation (a1)
6023      a3 = operation (a2),
6024
6025      we may end up with more than one vector result.  Here we reduce them
6026      to one vector.
6027
6028      The same is true for a SLP reduction, e.g.,
6029      # a1 = phi <a2, a0>
6030      # b1 = phi <b2, b0>
6031      a2 = operation (a1)
6032      b2 = operation (a2),
6033
6034      where we can end up with more than one vector as well.  We can
6035      easily accumulate vectors when the number of vector elements is
6036      a multiple of the SLP group size.
6037
6038      The same is true if we couldn't use a single defuse cycle.  */
6039   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6040       || direct_slp_reduc
6041       || (slp_reduc
6042           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6043       || ncopies > 1)
6044     {
6045       gimple_seq stmts = NULL;
6046       tree single_input = reduc_inputs[0];
6047       for (k = 1; k < reduc_inputs.length (); k++)
6048         single_input = gimple_build (&stmts, code, vectype,
6049                                      single_input, reduc_inputs[k]);
6050       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6051
6052       reduc_inputs.truncate (0);
6053       reduc_inputs.safe_push (single_input);
6054     }
6055
6056   tree orig_reduc_input = reduc_inputs[0];
6057
6058   /* If this loop is an epilogue loop that can be skipped after the
6059      main loop, we can only share a reduction operation between the
6060      main loop and the epilogue if we put it at the target of the
6061      skip edge.
6062
6063      We can still reuse accumulators if this check fails.  Doing so has
6064      the minor(?) benefit of making the epilogue loop's scalar result
6065      independent of the main loop's scalar result.  */
6066   bool unify_with_main_loop_p = false;
6067   if (reduc_info->reused_accumulator
6068       && loop_vinfo->skip_this_loop_edge
6069       && single_succ_p (exit_bb)
6070       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6071     {
6072       unify_with_main_loop_p = true;
6073
6074       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6075       reduc_inputs[0] = make_ssa_name (vectype);
6076       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6077       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6078                    UNKNOWN_LOCATION);
6079       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6080                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6081       exit_gsi = gsi_after_labels (reduc_block);
6082     }
6083
6084   /* Shouldn't be used beyond this point.  */
6085   exit_bb = nullptr;
6086
6087   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6088       && reduc_fn != IFN_LAST)
6089     {
6090       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6091          various data values where the condition matched and another vector
6092          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6093          need to extract the last matching index (which will be the index with
6094          highest value) and use this to index into the data vector.
6095          For the case where there were no matches, the data vector will contain
6096          all default values and the index vector will be all zeros.  */
6097
6098       /* Get various versions of the type of the vector of indexes.  */
6099       tree index_vec_type = TREE_TYPE (induction_index);
6100       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6101       tree index_scalar_type = TREE_TYPE (index_vec_type);
6102       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6103
6104       /* Get an unsigned integer version of the type of the data vector.  */
6105       int scalar_precision
6106         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6107       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6108       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6109                                                 vectype);
6110
6111       /* First we need to create a vector (ZERO_VEC) of zeros and another
6112          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6113          can create using a MAX reduction and then expanding.
6114          In the case where the loop never made any matches, the max index will
6115          be zero.  */
6116
6117       /* Vector of {0, 0, 0,...}.  */
6118       tree zero_vec = build_zero_cst (vectype);
6119
6120       /* Find maximum value from the vector of found indexes.  */
6121       tree max_index = make_ssa_name (index_scalar_type);
6122       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6123                                                           1, induction_index);
6124       gimple_call_set_lhs (max_index_stmt, max_index);
6125       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6126
6127       /* Vector of {max_index, max_index, max_index,...}.  */
6128       tree max_index_vec = make_ssa_name (index_vec_type);
6129       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6130                                                       max_index);
6131       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6132                                                         max_index_vec_rhs);
6133       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6134
6135       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6136          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6137          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6138          otherwise.  Only one value should match, resulting in a vector
6139          (VEC_COND) with one data value and the rest zeros.
6140          In the case where the loop never made any matches, every index will
6141          match, resulting in a vector with all data values (which will all be
6142          the default value).  */
6143
6144       /* Compare the max index vector to the vector of found indexes to find
6145          the position of the max value.  */
6146       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6147       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6148                                                       induction_index,
6149                                                       max_index_vec);
6150       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6151
6152       /* Use the compare to choose either values from the data vector or
6153          zero.  */
6154       tree vec_cond = make_ssa_name (vectype);
6155       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6156                                                    vec_compare,
6157                                                    reduc_inputs[0],
6158                                                    zero_vec);
6159       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6160
6161       /* Finally we need to extract the data value from the vector (VEC_COND)
6162          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6163          reduction, but because this doesn't exist, we can use a MAX reduction
6164          instead.  The data value might be signed or a float so we need to cast
6165          it first.
6166          In the case where the loop never made any matches, the data values are
6167          all identical, and so will reduce down correctly.  */
6168
6169       /* Make the matched data values unsigned.  */
6170       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6171       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6172                                        vec_cond);
6173       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6174                                                         VIEW_CONVERT_EXPR,
6175                                                         vec_cond_cast_rhs);
6176       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6177
6178       /* Reduce down to a scalar value.  */
6179       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6180       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6181                                                            1, vec_cond_cast);
6182       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6183       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6184
6185       /* Convert the reduced value back to the result type and set as the
6186          result.  */
6187       gimple_seq stmts = NULL;
6188       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6189                                data_reduc);
6190       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6191       scalar_results.safe_push (new_temp);
6192     }
6193   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6194            && reduc_fn == IFN_LAST)
6195     {
6196       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6197          idx = 0;
6198          idx_val = induction_index[0];
6199          val = data_reduc[0];
6200          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6201            if (induction_index[i] > idx_val)
6202              val = data_reduc[i], idx_val = induction_index[i];
6203          return val;  */
6204
6205       tree data_eltype = TREE_TYPE (vectype);
6206       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6207       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6208       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6209       /* Enforced by vectorizable_reduction, which ensures we have target
6210          support before allowing a conditional reduction on variable-length
6211          vectors.  */
6212       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6213       tree idx_val = NULL_TREE, val = NULL_TREE;
6214       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6215         {
6216           tree old_idx_val = idx_val;
6217           tree old_val = val;
6218           idx_val = make_ssa_name (idx_eltype);
6219           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6220                                              build3 (BIT_FIELD_REF, idx_eltype,
6221                                                      induction_index,
6222                                                      bitsize_int (el_size),
6223                                                      bitsize_int (off)));
6224           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6225           val = make_ssa_name (data_eltype);
6226           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6227                                              build3 (BIT_FIELD_REF,
6228                                                      data_eltype,
6229                                                      reduc_inputs[0],
6230                                                      bitsize_int (el_size),
6231                                                      bitsize_int (off)));
6232           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6233           if (off != 0)
6234             {
6235               tree new_idx_val = idx_val;
6236               if (off != v_size - el_size)
6237                 {
6238                   new_idx_val = make_ssa_name (idx_eltype);
6239                   epilog_stmt = gimple_build_assign (new_idx_val,
6240                                                      MAX_EXPR, idx_val,
6241                                                      old_idx_val);
6242                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6243                 }
6244               tree cond = make_ssa_name (boolean_type_node);
6245               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6246                                                  idx_val, old_idx_val);
6247               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6248               tree new_val = make_ssa_name (data_eltype);
6249               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6250                                                  cond, val, old_val);
6251               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6252               idx_val = new_idx_val;
6253               val = new_val;
6254             }
6255         }
6256       /* Convert the reduced value back to the result type and set as the
6257          result.  */
6258       gimple_seq stmts = NULL;
6259       val = gimple_convert (&stmts, scalar_type, val);
6260       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6261       scalar_results.safe_push (val);
6262     }
6263
6264   /* 2.3 Create the reduction code, using one of the three schemes described
6265          above. In SLP we simply need to extract all the elements from the
6266          vector (without reducing them), so we use scalar shifts.  */
6267   else if (reduc_fn != IFN_LAST && !slp_reduc)
6268     {
6269       tree tmp;
6270       tree vec_elem_type;
6271
6272       /* Case 1:  Create:
6273          v_out2 = reduc_expr <v_out1>  */
6274
6275       if (dump_enabled_p ())
6276         dump_printf_loc (MSG_NOTE, vect_location,
6277                          "Reduce using direct vector reduction.\n");
6278
6279       gimple_seq stmts = NULL;
6280       vec_elem_type = TREE_TYPE (vectype);
6281       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6282                                vec_elem_type, reduc_inputs[0]);
6283       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6284       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6285
6286       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6287           && induc_val)
6288         {
6289           /* Earlier we set the initial value to be a vector if induc_val
6290              values.  Check the result and if it is induc_val then replace
6291              with the original initial value, unless induc_val is
6292              the same as initial_def already.  */
6293           tree zcompare = make_ssa_name (boolean_type_node);
6294           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6295                                              new_temp, induc_val);
6296           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6297           tree initial_def = reduc_info->reduc_initial_values[0];
6298           tmp = make_ssa_name (new_scalar_dest);
6299           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6300                                              initial_def, new_temp);
6301           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6302           new_temp = tmp;
6303         }
6304
6305       scalar_results.safe_push (new_temp);
6306     }
6307   else if (direct_slp_reduc)
6308     {
6309       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6310          with the elements for other SLP statements replaced with the
6311          neutral value.  We can then do a normal reduction on each vector.  */
6312
6313       /* Enforced by vectorizable_reduction.  */
6314       gcc_assert (reduc_inputs.length () == 1);
6315       gcc_assert (pow2p_hwi (group_size));
6316
6317       gimple_seq seq = NULL;
6318
6319       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6320          and the same element size as VECTYPE.  */
6321       tree index = build_index_vector (vectype, 0, 1);
6322       tree index_type = TREE_TYPE (index);
6323       tree index_elt_type = TREE_TYPE (index_type);
6324       tree mask_type = truth_type_for (index_type);
6325
6326       /* Create a vector that, for each element, identifies which of
6327          the REDUC_GROUP_SIZE results should use it.  */
6328       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6329       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6330                             build_vector_from_val (index_type, index_mask));
6331
6332       /* Get a neutral vector value.  This is simply a splat of the neutral
6333          scalar value if we have one, otherwise the initial scalar value
6334          is itself a neutral value.  */
6335       tree vector_identity = NULL_TREE;
6336       tree neutral_op = NULL_TREE;
6337       if (slp_node)
6338         {
6339           tree initial_value = NULL_TREE;
6340           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6341             initial_value = reduc_info->reduc_initial_values[0];
6342           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6343                                                  initial_value);
6344         }
6345       if (neutral_op)
6346         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6347                                                         neutral_op);
6348       for (unsigned int i = 0; i < group_size; ++i)
6349         {
6350           /* If there's no univeral neutral value, we can use the
6351              initial scalar value from the original PHI.  This is used
6352              for MIN and MAX reduction, for example.  */
6353           if (!neutral_op)
6354             {
6355               tree scalar_value = reduc_info->reduc_initial_values[i];
6356               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6357                                              scalar_value);
6358               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6359                                                               scalar_value);
6360             }
6361
6362           /* Calculate the equivalent of:
6363
6364              sel[j] = (index[j] == i);
6365
6366              which selects the elements of REDUC_INPUTS[0] that should
6367              be included in the result.  */
6368           tree compare_val = build_int_cst (index_elt_type, i);
6369           compare_val = build_vector_from_val (index_type, compare_val);
6370           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6371                                    index, compare_val);
6372
6373           /* Calculate the equivalent of:
6374
6375              vec = seq ? reduc_inputs[0] : vector_identity;
6376
6377              VEC is now suitable for a full vector reduction.  */
6378           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6379                                    sel, reduc_inputs[0], vector_identity);
6380
6381           /* Do the reduction and convert it to the appropriate type.  */
6382           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6383                                       TREE_TYPE (vectype), vec);
6384           scalar = gimple_convert (&seq, scalar_type, scalar);
6385           scalar_results.safe_push (scalar);
6386         }
6387       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6388     }
6389   else
6390     {
6391       bool reduce_with_shift;
6392       tree vec_temp;
6393
6394       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6395
6396       /* See if the target wants to do the final (shift) reduction
6397          in a vector mode of smaller size and first reduce upper/lower
6398          halves against each other.  */
6399       enum machine_mode mode1 = mode;
6400       tree stype = TREE_TYPE (vectype);
6401       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6402       unsigned nunits1 = nunits;
6403       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6404           && reduc_inputs.length () == 1)
6405         {
6406           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6407           /* For SLP reductions we have to make sure lanes match up, but
6408              since we're doing individual element final reduction reducing
6409              vector width here is even more important.
6410              ???  We can also separate lanes with permutes, for the common
6411              case of power-of-two group-size odd/even extracts would work.  */
6412           if (slp_reduc && nunits != nunits1)
6413             {
6414               nunits1 = least_common_multiple (nunits1, group_size);
6415               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6416             }
6417         }
6418       if (!slp_reduc
6419           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6420         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6421
6422       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6423                                                            stype, nunits1);
6424       reduce_with_shift = have_whole_vector_shift (mode1);
6425       if (!VECTOR_MODE_P (mode1)
6426           || !directly_supported_p (code, vectype1))
6427         reduce_with_shift = false;
6428
6429       /* First reduce the vector to the desired vector size we should
6430          do shift reduction on by combining upper and lower halves.  */
6431       gimple_seq stmts = NULL;
6432       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6433                                              code, &stmts);
6434       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6435       reduc_inputs[0] = new_temp;
6436
6437       if (reduce_with_shift && !slp_reduc)
6438         {
6439           int element_bitsize = tree_to_uhwi (bitsize);
6440           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6441              for variable-length vectors and also requires direct target support
6442              for loop reductions.  */
6443           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6444           int nelements = vec_size_in_bits / element_bitsize;
6445           vec_perm_builder sel;
6446           vec_perm_indices indices;
6447
6448           int elt_offset;
6449
6450           tree zero_vec = build_zero_cst (vectype1);
6451           /* Case 2: Create:
6452              for (offset = nelements/2; offset >= 1; offset/=2)
6453                 {
6454                   Create:  va' = vec_shift <va, offset>
6455                   Create:  va = vop <va, va'>
6456                 }  */
6457
6458           tree rhs;
6459
6460           if (dump_enabled_p ())
6461             dump_printf_loc (MSG_NOTE, vect_location,
6462                              "Reduce using vector shifts\n");
6463
6464           gimple_seq stmts = NULL;
6465           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6466           for (elt_offset = nelements / 2;
6467                elt_offset >= 1;
6468                elt_offset /= 2)
6469             {
6470               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6471               indices.new_vector (sel, 2, nelements);
6472               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6473               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6474                                        new_temp, zero_vec, mask);
6475               new_temp = gimple_build (&stmts, code,
6476                                        vectype1, new_name, new_temp);
6477             }
6478           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6479
6480           /* 2.4  Extract the final scalar result.  Create:
6481              s_out3 = extract_field <v_out2, bitpos>  */
6482
6483           if (dump_enabled_p ())
6484             dump_printf_loc (MSG_NOTE, vect_location,
6485                              "extract scalar result\n");
6486
6487           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6488                         bitsize, bitsize_zero_node);
6489           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6490           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6491           gimple_assign_set_lhs (epilog_stmt, new_temp);
6492           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6493           scalar_results.safe_push (new_temp);
6494         }
6495       else
6496         {
6497           /* Case 3: Create:
6498              s = extract_field <v_out2, 0>
6499              for (offset = element_size;
6500                   offset < vector_size;
6501                   offset += element_size;)
6502                {
6503                  Create:  s' = extract_field <v_out2, offset>
6504                  Create:  s = op <s, s'>  // For non SLP cases
6505                }  */
6506
6507           if (dump_enabled_p ())
6508             dump_printf_loc (MSG_NOTE, vect_location,
6509                              "Reduce using scalar code.\n");
6510
6511           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6512           int element_bitsize = tree_to_uhwi (bitsize);
6513           tree compute_type = TREE_TYPE (vectype);
6514           gimple_seq stmts = NULL;
6515           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6516             {
6517               int bit_offset;
6518               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6519                                        vec_temp, bitsize, bitsize_zero_node);
6520
6521               /* In SLP we don't need to apply reduction operation, so we just
6522                  collect s' values in SCALAR_RESULTS.  */
6523               if (slp_reduc)
6524                 scalar_results.safe_push (new_temp);
6525
6526               for (bit_offset = element_bitsize;
6527                    bit_offset < vec_size_in_bits;
6528                    bit_offset += element_bitsize)
6529                 {
6530                   tree bitpos = bitsize_int (bit_offset);
6531                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6532                                            compute_type, vec_temp,
6533                                            bitsize, bitpos);
6534                   if (slp_reduc)
6535                     {
6536                       /* In SLP we don't need to apply reduction operation, so
6537                          we just collect s' values in SCALAR_RESULTS.  */
6538                       new_temp = new_name;
6539                       scalar_results.safe_push (new_name);
6540                     }
6541                   else
6542                     new_temp = gimple_build (&stmts, code, compute_type,
6543                                              new_name, new_temp);
6544                 }
6545             }
6546
6547           /* The only case where we need to reduce scalar results in SLP, is
6548              unrolling.  If the size of SCALAR_RESULTS is greater than
6549              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6550              REDUC_GROUP_SIZE.  */
6551           if (slp_reduc)
6552             {
6553               tree res, first_res, new_res;
6554
6555               /* Reduce multiple scalar results in case of SLP unrolling.  */
6556               for (j = group_size; scalar_results.iterate (j, &res);
6557                    j++)
6558                 {
6559                   first_res = scalar_results[j % group_size];
6560                   new_res = gimple_build (&stmts, code, compute_type,
6561                                           first_res, res);
6562                   scalar_results[j % group_size] = new_res;
6563                 }
6564               scalar_results.truncate (group_size);
6565               for (k = 0; k < group_size; k++)
6566                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6567                                                     scalar_results[k]);
6568             }
6569           else
6570             {
6571               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6572               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6573               scalar_results.safe_push (new_temp);
6574             }
6575
6576           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6577         }
6578
6579       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6580           && induc_val)
6581         {
6582           /* Earlier we set the initial value to be a vector if induc_val
6583              values.  Check the result and if it is induc_val then replace
6584              with the original initial value, unless induc_val is
6585              the same as initial_def already.  */
6586           tree zcompare = make_ssa_name (boolean_type_node);
6587           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6588                                              induc_val);
6589           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6590           tree initial_def = reduc_info->reduc_initial_values[0];
6591           tree tmp = make_ssa_name (new_scalar_dest);
6592           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6593                                              initial_def, new_temp);
6594           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6595           scalar_results[0] = tmp;
6596         }
6597     }
6598
6599   /* 2.5 Adjust the final result by the initial value of the reduction
6600          variable. (When such adjustment is not needed, then
6601          'adjustment_def' is zero).  For example, if code is PLUS we create:
6602          new_temp = loop_exit_def + adjustment_def  */
6603
6604   if (adjustment_def)
6605     {
6606       gcc_assert (!slp_reduc);
6607       gimple_seq stmts = NULL;
6608       if (double_reduc)
6609         {
6610           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6611           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6612           new_temp = gimple_build (&stmts, code, vectype,
6613                                    reduc_inputs[0], adjustment_def);
6614         }
6615       else
6616         {
6617           new_temp = scalar_results[0];
6618           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6619           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6620                                            adjustment_def);
6621           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6622           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6623                                    new_temp, adjustment_def);
6624           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6625         }
6626
6627       epilog_stmt = gimple_seq_last_stmt (stmts);
6628       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6629       scalar_results[0] = new_temp;
6630     }
6631
6632   /* Record this operation if it could be reused by the epilogue loop.  */
6633   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6634       && reduc_inputs.length () == 1)
6635     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6636                                            { orig_reduc_input, reduc_info });
6637
6638   if (double_reduc)
6639     loop = outer_loop;
6640
6641   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6642           phis with new adjusted scalar results, i.e., replace use <s_out0>
6643           with use <s_out4>.
6644
6645      Transform:
6646         loop_exit:
6647           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6648           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6649           v_out2 = reduce <v_out1>
6650           s_out3 = extract_field <v_out2, 0>
6651           s_out4 = adjust_result <s_out3>
6652           use <s_out0>
6653           use <s_out0>
6654
6655      into:
6656
6657         loop_exit:
6658           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6659           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6660           v_out2 = reduce <v_out1>
6661           s_out3 = extract_field <v_out2, 0>
6662           s_out4 = adjust_result <s_out3>
6663           use <s_out4>
6664           use <s_out4> */
6665
6666   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6667   for (k = 0; k < live_out_stmts.size (); k++)
6668     {
6669       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6670       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6671
6672       phis.create (3);
6673       /* Find the loop-closed-use at the loop exit of the original scalar
6674          result.  (The reduction result is expected to have two immediate uses,
6675          one at the latch block, and one at the loop exit).  For double
6676          reductions we are looking for exit phis of the outer loop.  */
6677       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6678         {
6679           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6680             {
6681               if (!is_gimple_debug (USE_STMT (use_p)))
6682                 phis.safe_push (USE_STMT (use_p));
6683             }
6684           else
6685             {
6686               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6687                 {
6688                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6689
6690                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6691                     {
6692                       if (!flow_bb_inside_loop_p (loop,
6693                                              gimple_bb (USE_STMT (phi_use_p)))
6694                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6695                         phis.safe_push (USE_STMT (phi_use_p));
6696                     }
6697                 }
6698             }
6699         }
6700
6701       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6702         {
6703           /* Replace the uses:  */
6704           orig_name = PHI_RESULT (exit_phi);
6705
6706           /* Look for a single use at the target of the skip edge.  */
6707           if (unify_with_main_loop_p)
6708             {
6709               use_operand_p use_p;
6710               gimple *user;
6711               if (!single_imm_use (orig_name, &use_p, &user))
6712                 gcc_unreachable ();
6713               orig_name = gimple_get_lhs (user);
6714             }
6715
6716           scalar_result = scalar_results[k];
6717           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6718             {
6719               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6720                 SET_USE (use_p, scalar_result);
6721               update_stmt (use_stmt);
6722             }
6723         }
6724
6725       phis.release ();
6726     }
6727 }
6728
6729 /* Return a vector of type VECTYPE that is equal to the vector select
6730    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6731    before GSI.  */
6732
6733 static tree
6734 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6735                      tree vec, tree identity)
6736 {
6737   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6738   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6739                                           mask, vec, identity);
6740   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6741   return cond;
6742 }
6743
6744 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6745    order, starting with LHS.  Insert the extraction statements before GSI and
6746    associate the new scalar SSA names with variable SCALAR_DEST.
6747    Return the SSA name for the result.  */
6748
6749 static tree
6750 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6751                        tree_code code, tree lhs, tree vector_rhs)
6752 {
6753   tree vectype = TREE_TYPE (vector_rhs);
6754   tree scalar_type = TREE_TYPE (vectype);
6755   tree bitsize = TYPE_SIZE (scalar_type);
6756   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6757   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6758
6759   for (unsigned HOST_WIDE_INT bit_offset = 0;
6760        bit_offset < vec_size_in_bits;
6761        bit_offset += element_bitsize)
6762     {
6763       tree bitpos = bitsize_int (bit_offset);
6764       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6765                          bitsize, bitpos);
6766
6767       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6768       rhs = make_ssa_name (scalar_dest, stmt);
6769       gimple_assign_set_lhs (stmt, rhs);
6770       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6771
6772       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6773       tree new_name = make_ssa_name (scalar_dest, stmt);
6774       gimple_assign_set_lhs (stmt, new_name);
6775       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6776       lhs = new_name;
6777     }
6778   return lhs;
6779 }
6780
6781 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6782    type of the vector input.  */
6783
6784 static internal_fn
6785 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6786 {
6787   internal_fn mask_reduc_fn;
6788
6789   switch (reduc_fn)
6790     {
6791     case IFN_FOLD_LEFT_PLUS:
6792       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6793       break;
6794
6795     default:
6796       return IFN_LAST;
6797     }
6798
6799   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6800                                       OPTIMIZE_FOR_SPEED))
6801     return mask_reduc_fn;
6802   return IFN_LAST;
6803 }
6804
6805 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6806    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6807    statement.  CODE is the operation performed by STMT_INFO and OPS are
6808    its scalar operands.  REDUC_INDEX is the index of the operand in
6809    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6810    implements in-order reduction, or IFN_LAST if we should open-code it.
6811    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6812    that should be used to control the operation in a fully-masked loop.  */
6813
6814 static bool
6815 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6816                                stmt_vec_info stmt_info,
6817                                gimple_stmt_iterator *gsi,
6818                                gimple **vec_stmt, slp_tree slp_node,
6819                                gimple *reduc_def_stmt,
6820                                tree_code code, internal_fn reduc_fn,
6821                                tree ops[3], tree vectype_in,
6822                                int reduc_index, vec_loop_masks *masks)
6823 {
6824   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6825   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6826   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6827
6828   int ncopies;
6829   if (slp_node)
6830     ncopies = 1;
6831   else
6832     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6833
6834   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6835   gcc_assert (ncopies == 1);
6836   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6837
6838   if (slp_node)
6839     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6840                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6841
6842   tree op0 = ops[1 - reduc_index];
6843
6844   int group_size = 1;
6845   stmt_vec_info scalar_dest_def_info;
6846   auto_vec<tree> vec_oprnds0;
6847   if (slp_node)
6848     {
6849       auto_vec<vec<tree> > vec_defs (2);
6850       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6851       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6852       vec_defs[0].release ();
6853       vec_defs[1].release ();
6854       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6855       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6856     }
6857   else
6858     {
6859       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6860                                      op0, &vec_oprnds0);
6861       scalar_dest_def_info = stmt_info;
6862     }
6863
6864   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6865   tree scalar_type = TREE_TYPE (scalar_dest);
6866   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6867
6868   int vec_num = vec_oprnds0.length ();
6869   gcc_assert (vec_num == 1 || slp_node);
6870   tree vec_elem_type = TREE_TYPE (vectype_out);
6871   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6872
6873   tree vector_identity = NULL_TREE;
6874   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6875     vector_identity = build_zero_cst (vectype_out);
6876
6877   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6878   int i;
6879   tree def0;
6880   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6881     {
6882       gimple *new_stmt;
6883       tree mask = NULL_TREE;
6884       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6885         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6886
6887       /* Handle MINUS by adding the negative.  */
6888       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6889         {
6890           tree negated = make_ssa_name (vectype_out);
6891           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6892           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6893           def0 = negated;
6894         }
6895
6896       if (mask && mask_reduc_fn == IFN_LAST)
6897         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6898                                     vector_identity);
6899
6900       /* On the first iteration the input is simply the scalar phi
6901          result, and for subsequent iterations it is the output of
6902          the preceding operation.  */
6903       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6904         {
6905           if (mask && mask_reduc_fn != IFN_LAST)
6906             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6907                                                    def0, mask);
6908           else
6909             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6910                                                    def0);
6911           /* For chained SLP reductions the output of the previous reduction
6912              operation serves as the input of the next. For the final statement
6913              the output cannot be a temporary - we reuse the original
6914              scalar destination of the last statement.  */
6915           if (i != vec_num - 1)
6916             {
6917               gimple_set_lhs (new_stmt, scalar_dest_var);
6918               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6919               gimple_set_lhs (new_stmt, reduc_var);
6920             }
6921         }
6922       else
6923         {
6924           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6925                                              reduc_var, def0);
6926           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6927           /* Remove the statement, so that we can use the same code paths
6928              as for statements that we've just created.  */
6929           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6930           gsi_remove (&tmp_gsi, true);
6931         }
6932
6933       if (i == vec_num - 1)
6934         {
6935           gimple_set_lhs (new_stmt, scalar_dest);
6936           vect_finish_replace_stmt (loop_vinfo,
6937                                     scalar_dest_def_info,
6938                                     new_stmt);
6939         }
6940       else
6941         vect_finish_stmt_generation (loop_vinfo,
6942                                      scalar_dest_def_info,
6943                                      new_stmt, gsi);
6944
6945       if (slp_node)
6946         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6947       else
6948         {
6949           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6950           *vec_stmt = new_stmt;
6951         }
6952     }
6953
6954   return true;
6955 }
6956
6957 /* Function is_nonwrapping_integer_induction.
6958
6959    Check if STMT_VINO (which is part of loop LOOP) both increments and
6960    does not cause overflow.  */
6961
6962 static bool
6963 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6964 {
6965   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6966   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6967   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6968   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6969   widest_int ni, max_loop_value, lhs_max;
6970   wi::overflow_type overflow = wi::OVF_NONE;
6971
6972   /* Make sure the loop is integer based.  */
6973   if (TREE_CODE (base) != INTEGER_CST
6974       || TREE_CODE (step) != INTEGER_CST)
6975     return false;
6976
6977   /* Check that the max size of the loop will not wrap.  */
6978
6979   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6980     return true;
6981
6982   if (! max_stmt_executions (loop, &ni))
6983     return false;
6984
6985   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6986                             &overflow);
6987   if (overflow)
6988     return false;
6989
6990   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6991                             TYPE_SIGN (lhs_type), &overflow);
6992   if (overflow)
6993     return false;
6994
6995   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6996           <= TYPE_PRECISION (lhs_type));
6997 }
6998
6999 /* Check if masking can be supported by inserting a conditional expression.
7000    CODE is the code for the operation.  COND_FN is the conditional internal
7001    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
7002 static bool
7003 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7004                          tree vectype_in)
7005 {
7006   if (cond_fn != IFN_LAST
7007       && direct_internal_fn_supported_p (cond_fn, vectype_in,
7008                                          OPTIMIZE_FOR_SPEED))
7009     return false;
7010
7011   if (code.is_tree_code ())
7012     switch (tree_code (code))
7013       {
7014       case DOT_PROD_EXPR:
7015       case SAD_EXPR:
7016         return true;
7017
7018       default:
7019         break;
7020       }
7021   return false;
7022 }
7023
7024 /* Insert a conditional expression to enable masked vectorization.  CODE is the
7025    code for the operation.  VOP is the array of operands.  MASK is the loop
7026    mask.  GSI is a statement iterator used to place the new conditional
7027    expression.  */
7028 static void
7029 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7030                       gimple_stmt_iterator *gsi)
7031 {
7032   switch (tree_code (code))
7033     {
7034     case DOT_PROD_EXPR:
7035       {
7036         tree vectype = TREE_TYPE (vop[1]);
7037         tree zero = build_zero_cst (vectype);
7038         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7039         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7040                                                mask, vop[1], zero);
7041         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7042         vop[1] = masked_op1;
7043         break;
7044       }
7045
7046     case SAD_EXPR:
7047       {
7048         tree vectype = TREE_TYPE (vop[1]);
7049         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7050         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7051                                                mask, vop[1], vop[0]);
7052         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7053         vop[1] = masked_op1;
7054         break;
7055       }
7056
7057     default:
7058       gcc_unreachable ();
7059     }
7060 }
7061
7062 /* Function vectorizable_reduction.
7063
7064    Check if STMT_INFO performs a reduction operation that can be vectorized.
7065    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7066    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7067    Return true if STMT_INFO is vectorizable in this way.
7068
7069    This function also handles reduction idioms (patterns) that have been
7070    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7071    may be of this form:
7072      X = pattern_expr (arg0, arg1, ..., X)
7073    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7074    sequence that had been detected and replaced by the pattern-stmt
7075    (STMT_INFO).
7076
7077    This function also handles reduction of condition expressions, for example:
7078      for (int i = 0; i < N; i++)
7079        if (a[i] < value)
7080          last = a[i];
7081    This is handled by vectorising the loop and creating an additional vector
7082    containing the loop indexes for which "a[i] < value" was true.  In the
7083    function epilogue this is reduced to a single max value and then used to
7084    index into the vector of results.
7085
7086    In some cases of reduction patterns, the type of the reduction variable X is
7087    different than the type of the other arguments of STMT_INFO.
7088    In such cases, the vectype that is used when transforming STMT_INFO into
7089    a vector stmt is different than the vectype that is used to determine the
7090    vectorization factor, because it consists of a different number of elements
7091    than the actual number of elements that are being operated upon in parallel.
7092
7093    For example, consider an accumulation of shorts into an int accumulator.
7094    On some targets it's possible to vectorize this pattern operating on 8
7095    shorts at a time (hence, the vectype for purposes of determining the
7096    vectorization factor should be V8HI); on the other hand, the vectype that
7097    is used to create the vector form is actually V4SI (the type of the result).
7098
7099    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7100    indicates what is the actual level of parallelism (V8HI in the example), so
7101    that the right vectorization factor would be derived.  This vectype
7102    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7103    be used to create the vectorized stmt.  The right vectype for the vectorized
7104    stmt is obtained from the type of the result X:
7105       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7106
7107    This means that, contrary to "regular" reductions (or "regular" stmts in
7108    general), the following equation:
7109       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7110    does *NOT* necessarily hold for reduction patterns.  */
7111
7112 bool
7113 vectorizable_reduction (loop_vec_info loop_vinfo,
7114                         stmt_vec_info stmt_info, slp_tree slp_node,
7115                         slp_instance slp_node_instance,
7116                         stmt_vector_for_cost *cost_vec)
7117 {
7118   tree vectype_in = NULL_TREE;
7119   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7120   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7121   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7122   stmt_vec_info cond_stmt_vinfo = NULL;
7123   int i;
7124   int ncopies;
7125   bool single_defuse_cycle = false;
7126   bool nested_cycle = false;
7127   bool double_reduc = false;
7128   int vec_num;
7129   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7130   tree cond_reduc_val = NULL_TREE;
7131
7132   /* Make sure it was already recognized as a reduction computation.  */
7133   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7134       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7135       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7136     return false;
7137
7138   /* The stmt we store reduction analysis meta on.  */
7139   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7140   reduc_info->is_reduc_info = true;
7141
7142   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7143     {
7144       if (is_a <gphi *> (stmt_info->stmt))
7145         {
7146           if (slp_node)
7147             {
7148               /* We eventually need to set a vector type on invariant
7149                  arguments.  */
7150               unsigned j;
7151               slp_tree child;
7152               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7153                 if (!vect_maybe_update_slp_op_vectype
7154                        (child, SLP_TREE_VECTYPE (slp_node)))
7155                   {
7156                     if (dump_enabled_p ())
7157                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7158                                        "incompatible vector types for "
7159                                        "invariants\n");
7160                     return false;
7161                   }
7162             }
7163           /* Analysis for double-reduction is done on the outer
7164              loop PHI, nested cycles have no further restrictions.  */
7165           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7166         }
7167       else
7168         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7169       return true;
7170     }
7171
7172   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7173   stmt_vec_info phi_info = stmt_info;
7174   if (!is_a <gphi *> (stmt_info->stmt))
7175     {
7176       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7177       return true;
7178     }
7179   if (slp_node)
7180     {
7181       slp_node_instance->reduc_phis = slp_node;
7182       /* ???  We're leaving slp_node to point to the PHIs, we only
7183          need it to get at the number of vector stmts which wasn't
7184          yet initialized for the instance root.  */
7185     }
7186   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7187     {
7188       use_operand_p use_p;
7189       gimple *use_stmt;
7190       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7191                                  &use_p, &use_stmt);
7192       gcc_assert (res);
7193       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7194     }
7195
7196   /* PHIs should not participate in patterns.  */
7197   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7198   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7199
7200   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7201      and compute the reduction chain length.  Discover the real
7202      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7203   tree reduc_def
7204     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7205                              loop_latch_edge
7206                                (gimple_bb (reduc_def_phi)->loop_father));
7207   unsigned reduc_chain_length = 0;
7208   bool only_slp_reduc_chain = true;
7209   stmt_info = NULL;
7210   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7211   while (reduc_def != PHI_RESULT (reduc_def_phi))
7212     {
7213       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7214       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7215       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7216         {
7217           if (dump_enabled_p ())
7218             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7219                              "reduction chain broken by patterns.\n");
7220           return false;
7221         }
7222       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7223         only_slp_reduc_chain = false;
7224       /* For epilogue generation live members of the chain need
7225          to point back to the PHI via their original stmt for
7226          info_for_reduction to work.  For SLP we need to look at
7227          all lanes here - even though we only will vectorize from
7228          the SLP node with live lane zero the other live lanes also
7229          need to be identified as part of a reduction to be able
7230          to skip code generation for them.  */
7231       if (slp_for_stmt_info)
7232         {
7233           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7234             if (STMT_VINFO_LIVE_P (s))
7235               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7236         }
7237       else if (STMT_VINFO_LIVE_P (vdef))
7238         STMT_VINFO_REDUC_DEF (def) = phi_info;
7239       gimple_match_op op;
7240       if (!gimple_extract_op (vdef->stmt, &op))
7241         {
7242           if (dump_enabled_p ())
7243             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7244                              "reduction chain includes unsupported"
7245                              " statement type.\n");
7246           return false;
7247         }
7248       if (CONVERT_EXPR_CODE_P (op.code))
7249         {
7250           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7251             {
7252               if (dump_enabled_p ())
7253                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7254                                  "conversion in the reduction chain.\n");
7255               return false;
7256             }
7257         }
7258       else if (!stmt_info)
7259         /* First non-conversion stmt.  */
7260         stmt_info = vdef;
7261       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7262       reduc_chain_length++;
7263       if (!stmt_info && slp_node)
7264         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7265     }
7266   /* PHIs should not participate in patterns.  */
7267   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7268
7269   if (nested_in_vect_loop_p (loop, stmt_info))
7270     {
7271       loop = loop->inner;
7272       nested_cycle = true;
7273     }
7274
7275   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7276      element.  */
7277   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7278     {
7279       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7280       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7281     }
7282   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7283     gcc_assert (slp_node
7284                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7285
7286   /* 1. Is vectorizable reduction?  */
7287   /* Not supportable if the reduction variable is used in the loop, unless
7288      it's a reduction chain.  */
7289   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7290       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7291     return false;
7292
7293   /* Reductions that are not used even in an enclosing outer-loop,
7294      are expected to be "live" (used out of the loop).  */
7295   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7296       && !STMT_VINFO_LIVE_P (stmt_info))
7297     return false;
7298
7299   /* 2. Has this been recognized as a reduction pattern?
7300
7301      Check if STMT represents a pattern that has been recognized
7302      in earlier analysis stages.  For stmts that represent a pattern,
7303      the STMT_VINFO_RELATED_STMT field records the last stmt in
7304      the original sequence that constitutes the pattern.  */
7305
7306   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7307   if (orig_stmt_info)
7308     {
7309       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7310       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7311     }
7312
7313   /* 3. Check the operands of the operation.  The first operands are defined
7314         inside the loop body. The last operand is the reduction variable,
7315         which is defined by the loop-header-phi.  */
7316
7317   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7318   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7319   gimple_match_op op;
7320   if (!gimple_extract_op (stmt_info->stmt, &op))
7321     gcc_unreachable ();
7322   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7323                             || op.code == WIDEN_SUM_EXPR
7324                             || op.code == SAD_EXPR);
7325
7326   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7327       && !SCALAR_FLOAT_TYPE_P (op.type))
7328     return false;
7329
7330   /* Do not try to vectorize bit-precision reductions.  */
7331   if (!type_has_mode_precision_p (op.type))
7332     return false;
7333
7334   /* For lane-reducing ops we're reducing the number of reduction PHIs
7335      which means the only use of that may be in the lane-reducing operation.  */
7336   if (lane_reduc_code_p
7337       && reduc_chain_length != 1
7338       && !only_slp_reduc_chain)
7339     {
7340       if (dump_enabled_p ())
7341         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7342                          "lane-reducing reduction with extra stmts.\n");
7343       return false;
7344     }
7345
7346   /* All uses but the last are expected to be defined in the loop.
7347      The last use is the reduction variable.  In case of nested cycle this
7348      assumption is not true: we use reduc_index to record the index of the
7349      reduction variable.  */
7350   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7351   /* We need to skip an extra operand for COND_EXPRs with embedded
7352      comparison.  */
7353   unsigned opno_adjust = 0;
7354   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7355     opno_adjust = 1;
7356   for (i = 0; i < (int) op.num_ops; i++)
7357     {
7358       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7359       if (i == 0 && op.code == COND_EXPR)
7360         continue;
7361
7362       stmt_vec_info def_stmt_info;
7363       enum vect_def_type dt;
7364       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7365                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7366                                &vectype_op[i], &def_stmt_info))
7367         {
7368           if (dump_enabled_p ())
7369             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7370                              "use not simple.\n");
7371           return false;
7372         }
7373       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7374         continue;
7375
7376       /* There should be only one cycle def in the stmt, the one
7377          leading to reduc_def.  */
7378       if (VECTORIZABLE_CYCLE_DEF (dt))
7379         return false;
7380
7381       if (!vectype_op[i])
7382         vectype_op[i]
7383           = get_vectype_for_scalar_type (loop_vinfo,
7384                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7385
7386       /* To properly compute ncopies we are interested in the widest
7387          non-reduction input type in case we're looking at a widening
7388          accumulation that we later handle in vect_transform_reduction.  */
7389       if (lane_reduc_code_p
7390           && vectype_op[i]
7391           && (!vectype_in
7392               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7393                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7394         vectype_in = vectype_op[i];
7395
7396       if (op.code == COND_EXPR)
7397         {
7398           /* Record how the non-reduction-def value of COND_EXPR is defined.  */
7399           if (dt == vect_constant_def)
7400             {
7401               cond_reduc_dt = dt;
7402               cond_reduc_val = op.ops[i];
7403             }
7404           if (dt == vect_induction_def
7405               && def_stmt_info
7406               && is_nonwrapping_integer_induction (def_stmt_info, loop))
7407             {
7408               cond_reduc_dt = dt;
7409               cond_stmt_vinfo = def_stmt_info;
7410             }
7411         }
7412     }
7413   if (!vectype_in)
7414     vectype_in = STMT_VINFO_VECTYPE (phi_info);
7415   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7416
7417   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7418   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7419   /* If we have a condition reduction, see if we can simplify it further.  */
7420   if (v_reduc_type == COND_REDUCTION)
7421     {
7422       if (slp_node)
7423         return false;
7424
7425       /* When the condition uses the reduction value in the condition, fail.  */
7426       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7427         {
7428           if (dump_enabled_p ())
7429             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7430                              "condition depends on previous iteration\n");
7431           return false;
7432         }
7433
7434       if (reduc_chain_length == 1
7435           && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7436                                              vectype_in, OPTIMIZE_FOR_SPEED))
7437         {
7438           if (dump_enabled_p ())
7439             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7440                              "optimizing condition reduction with"
7441                              " FOLD_EXTRACT_LAST.\n");
7442           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7443         }
7444       else if (cond_reduc_dt == vect_induction_def)
7445         {
7446           tree base
7447             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7448           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7449
7450           gcc_assert (TREE_CODE (base) == INTEGER_CST
7451                       && TREE_CODE (step) == INTEGER_CST);
7452           cond_reduc_val = NULL_TREE;
7453           enum tree_code cond_reduc_op_code = ERROR_MARK;
7454           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7455           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7456             ;
7457           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7458              above base; punt if base is the minimum value of the type for
7459              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7460           else if (tree_int_cst_sgn (step) == -1)
7461             {
7462               cond_reduc_op_code = MIN_EXPR;
7463               if (tree_int_cst_sgn (base) == -1)
7464                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7465               else if (tree_int_cst_lt (base,
7466                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
7467                 cond_reduc_val
7468                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
7469             }
7470           else
7471             {
7472               cond_reduc_op_code = MAX_EXPR;
7473               if (tree_int_cst_sgn (base) == 1)
7474                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7475               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7476                                         base))
7477                 cond_reduc_val
7478                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
7479             }
7480           if (cond_reduc_val)
7481             {
7482               if (dump_enabled_p ())
7483                 dump_printf_loc (MSG_NOTE, vect_location,
7484                                  "condition expression based on "
7485                                  "integer induction.\n");
7486               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7487               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7488                 = cond_reduc_val;
7489               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7490             }
7491         }
7492       else if (cond_reduc_dt == vect_constant_def)
7493         {
7494           enum vect_def_type cond_initial_dt;
7495           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7496           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7497           if (cond_initial_dt == vect_constant_def
7498               && types_compatible_p (TREE_TYPE (cond_initial_val),
7499                                      TREE_TYPE (cond_reduc_val)))
7500             {
7501               tree e = fold_binary (LE_EXPR, boolean_type_node,
7502                                     cond_initial_val, cond_reduc_val);
7503               if (e && (integer_onep (e) || integer_zerop (e)))
7504                 {
7505                   if (dump_enabled_p ())
7506                     dump_printf_loc (MSG_NOTE, vect_location,
7507                                      "condition expression based on "
7508                                      "compile time constant.\n");
7509                   /* Record reduction code at analysis stage.  */
7510                   STMT_VINFO_REDUC_CODE (reduc_info)
7511                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7512                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7513                 }
7514             }
7515         }
7516     }
7517
7518   if (STMT_VINFO_LIVE_P (phi_info))
7519     return false;
7520
7521   if (slp_node)
7522     ncopies = 1;
7523   else
7524     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7525
7526   gcc_assert (ncopies >= 1);
7527
7528   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7529
7530   if (nested_cycle)
7531     {
7532       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7533                   == vect_double_reduction_def);
7534       double_reduc = true;
7535     }
7536
7537   /* 4.2. Check support for the epilog operation.
7538
7539           If STMT represents a reduction pattern, then the type of the
7540           reduction variable may be different than the type of the rest
7541           of the arguments.  For example, consider the case of accumulation
7542           of shorts into an int accumulator; The original code:
7543                         S1: int_a = (int) short_a;
7544           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7545
7546           was replaced with:
7547                         STMT: int_acc = widen_sum <short_a, int_acc>
7548
7549           This means that:
7550           1. The tree-code that is used to create the vector operation in the
7551              epilog code (that reduces the partial results) is not the
7552              tree-code of STMT, but is rather the tree-code of the original
7553              stmt from the pattern that STMT is replacing.  I.e, in the example
7554              above we want to use 'widen_sum' in the loop, but 'plus' in the
7555              epilog.
7556           2. The type (mode) we use to check available target support
7557              for the vector operation to be created in the *epilog*, is
7558              determined by the type of the reduction variable (in the example
7559              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7560              However the type (mode) we use to check available target support
7561              for the vector operation to be created *inside the loop*, is
7562              determined by the type of the other arguments to STMT (in the
7563              example we'd check this: optab_handler (widen_sum_optab,
7564              vect_short_mode)).
7565
7566           This is contrary to "regular" reductions, in which the types of all
7567           the arguments are the same as the type of the reduction variable.
7568           For "regular" reductions we can therefore use the same vector type
7569           (and also the same tree-code) when generating the epilog code and
7570           when generating the code inside the loop.  */
7571
7572   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7573   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7574
7575   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7576   if (reduction_type == TREE_CODE_REDUCTION)
7577     {
7578       /* Check whether it's ok to change the order of the computation.
7579          Generally, when vectorizing a reduction we change the order of the
7580          computation.  This may change the behavior of the program in some
7581          cases, so we need to check that this is ok.  One exception is when
7582          vectorizing an outer-loop: the inner-loop is executed sequentially,
7583          and therefore vectorizing reductions in the inner-loop during
7584          outer-loop vectorization is safe.  Likewise when we are vectorizing
7585          a series of reductions using SLP and the VF is one the reductions
7586          are performed in scalar order.  */
7587       if (slp_node
7588           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7589           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7590         ;
7591       else if (needs_fold_left_reduction_p (op.type, orig_code))
7592         {
7593           /* When vectorizing a reduction chain w/o SLP the reduction PHI
7594              is not directy used in stmt.  */
7595           if (!only_slp_reduc_chain
7596               && reduc_chain_length != 1)
7597             {
7598               if (dump_enabled_p ())
7599                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7600                                  "in-order reduction chain without SLP.\n");
7601               return false;
7602             }
7603           STMT_VINFO_REDUC_TYPE (reduc_info)
7604             = reduction_type = FOLD_LEFT_REDUCTION;
7605         }
7606       else if (!commutative_binary_op_p (orig_code, op.type)
7607                || !associative_binary_op_p (orig_code, op.type))
7608         {
7609           if (dump_enabled_p ())
7610             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7611                             "reduction: not commutative/associative");
7612           return false;
7613         }
7614     }
7615
7616   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7617       && ncopies > 1)
7618     {
7619       if (dump_enabled_p ())
7620         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7621                          "multiple types in double reduction or condition "
7622                          "reduction or fold-left reduction.\n");
7623       return false;
7624     }
7625
7626   internal_fn reduc_fn = IFN_LAST;
7627   if (reduction_type == TREE_CODE_REDUCTION
7628       || reduction_type == FOLD_LEFT_REDUCTION
7629       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7630       || reduction_type == CONST_COND_REDUCTION)
7631     {
7632       if (reduction_type == FOLD_LEFT_REDUCTION
7633           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7634           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7635         {
7636           if (reduc_fn != IFN_LAST
7637               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7638                                                   OPTIMIZE_FOR_SPEED))
7639             {
7640               if (dump_enabled_p ())
7641                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7642                                  "reduc op not supported by target.\n");
7643
7644               reduc_fn = IFN_LAST;
7645             }
7646         }
7647       else
7648         {
7649           if (!nested_cycle || double_reduc)
7650             {
7651               if (dump_enabled_p ())
7652                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7653                                  "no reduc code for scalar code.\n");
7654
7655               return false;
7656             }
7657         }
7658     }
7659   else if (reduction_type == COND_REDUCTION)
7660     {
7661       int scalar_precision
7662         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7663       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7664       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7665                                                 vectype_out);
7666
7667       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7668                                           OPTIMIZE_FOR_SPEED))
7669         reduc_fn = IFN_REDUC_MAX;
7670     }
7671   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7672
7673   if (reduction_type != EXTRACT_LAST_REDUCTION
7674       && (!nested_cycle || double_reduc)
7675       && reduc_fn == IFN_LAST
7676       && !nunits_out.is_constant ())
7677     {
7678       if (dump_enabled_p ())
7679         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7680                          "missing target support for reduction on"
7681                          " variable-length vectors.\n");
7682       return false;
7683     }
7684
7685   /* For SLP reductions, see if there is a neutral value we can use.  */
7686   tree neutral_op = NULL_TREE;
7687   if (slp_node)
7688     {
7689       tree initial_value = NULL_TREE;
7690       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7691         initial_value = vect_phi_initial_value (reduc_def_phi);
7692       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7693                                              orig_code, initial_value);
7694     }
7695
7696   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7697     {
7698       /* We can't support in-order reductions of code such as this:
7699
7700            for (int i = 0; i < n1; ++i)
7701              for (int j = 0; j < n2; ++j)
7702                l += a[j];
7703
7704          since GCC effectively transforms the loop when vectorizing:
7705
7706            for (int i = 0; i < n1 / VF; ++i)
7707              for (int j = 0; j < n2; ++j)
7708                for (int k = 0; k < VF; ++k)
7709                  l += a[j];
7710
7711          which is a reassociation of the original operation.  */
7712       if (dump_enabled_p ())
7713         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714                          "in-order double reduction not supported.\n");
7715
7716       return false;
7717     }
7718
7719   if (reduction_type == FOLD_LEFT_REDUCTION
7720       && slp_node
7721       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7722     {
7723       /* We cannot use in-order reductions in this case because there is
7724          an implicit reassociation of the operations involved.  */
7725       if (dump_enabled_p ())
7726         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7727                          "in-order unchained SLP reductions not supported.\n");
7728       return false;
7729     }
7730
7731   /* For double reductions, and for SLP reductions with a neutral value,
7732      we construct a variable-length initial vector by loading a vector
7733      full of the neutral value and then shift-and-inserting the start
7734      values into the low-numbered elements.  */
7735   if ((double_reduc || neutral_op)
7736       && !nunits_out.is_constant ()
7737       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7738                                           vectype_out, OPTIMIZE_FOR_SPEED))
7739     {
7740       if (dump_enabled_p ())
7741         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742                          "reduction on variable-length vectors requires"
7743                          " target support for a vector-shift-and-insert"
7744                          " operation.\n");
7745       return false;
7746     }
7747
7748   /* Check extra constraints for variable-length unchained SLP reductions.  */
7749   if (slp_node
7750       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7751       && !nunits_out.is_constant ())
7752     {
7753       /* We checked above that we could build the initial vector when
7754          there's a neutral element value.  Check here for the case in
7755          which each SLP statement has its own initial value and in which
7756          that value needs to be repeated for every instance of the
7757          statement within the initial vector.  */
7758       unsigned int group_size = SLP_TREE_LANES (slp_node);
7759       if (!neutral_op
7760           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7761                                               TREE_TYPE (vectype_out)))
7762         {
7763           if (dump_enabled_p ())
7764             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7765                              "unsupported form of SLP reduction for"
7766                              " variable-length vectors: cannot build"
7767                              " initial vector.\n");
7768           return false;
7769         }
7770       /* The epilogue code relies on the number of elements being a multiple
7771          of the group size.  The duplicate-and-interleave approach to setting
7772          up the initial vector does too.  */
7773       if (!multiple_p (nunits_out, group_size))
7774         {
7775           if (dump_enabled_p ())
7776             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7777                              "unsupported form of SLP reduction for"
7778                              " variable-length vectors: the vector size"
7779                              " is not a multiple of the number of results.\n");
7780           return false;
7781         }
7782     }
7783
7784   if (reduction_type == COND_REDUCTION)
7785     {
7786       widest_int ni;
7787
7788       if (! max_loop_iterations (loop, &ni))
7789         {
7790           if (dump_enabled_p ())
7791             dump_printf_loc (MSG_NOTE, vect_location,
7792                              "loop count not known, cannot create cond "
7793                              "reduction.\n");
7794           return false;
7795         }
7796       /* Convert backedges to iterations.  */
7797       ni += 1;
7798
7799       /* The additional index will be the same type as the condition.  Check
7800          that the loop can fit into this less one (because we'll use up the
7801          zero slot for when there are no matches).  */
7802       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7803       if (wi::geu_p (ni, wi::to_widest (max_index)))
7804         {
7805           if (dump_enabled_p ())
7806             dump_printf_loc (MSG_NOTE, vect_location,
7807                              "loop size is greater than data size.\n");
7808           return false;
7809         }
7810     }
7811
7812   /* In case the vectorization factor (VF) is bigger than the number
7813      of elements that we can fit in a vectype (nunits), we have to generate
7814      more than one vector stmt - i.e - we need to "unroll" the
7815      vector stmt by a factor VF/nunits.  For more details see documentation
7816      in vectorizable_operation.  */
7817
7818   /* If the reduction is used in an outer loop we need to generate
7819      VF intermediate results, like so (e.g. for ncopies=2):
7820         r0 = phi (init, r0)
7821         r1 = phi (init, r1)
7822         r0 = x0 + r0;
7823         r1 = x1 + r1;
7824     (i.e. we generate VF results in 2 registers).
7825     In this case we have a separate def-use cycle for each copy, and therefore
7826     for each copy we get the vector def for the reduction variable from the
7827     respective phi node created for this copy.
7828
7829     Otherwise (the reduction is unused in the loop nest), we can combine
7830     together intermediate results, like so (e.g. for ncopies=2):
7831         r = phi (init, r)
7832         r = x0 + r;
7833         r = x1 + r;
7834    (i.e. we generate VF/2 results in a single register).
7835    In this case for each copy we get the vector def for the reduction variable
7836    from the vectorized reduction operation generated in the previous iteration.
7837
7838    This only works when we see both the reduction PHI and its only consumer
7839    in vectorizable_reduction and there are no intermediate stmts
7840    participating.  When unrolling we want each unrolled iteration to have its
7841    own reduction accumulator since one of the main goals of unrolling a
7842    reduction is to reduce the aggregate loop-carried latency.  */
7843   if (ncopies > 1
7844       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7845       && reduc_chain_length == 1
7846       && loop_vinfo->suggested_unroll_factor == 1)
7847     single_defuse_cycle = true;
7848
7849   if (single_defuse_cycle || lane_reduc_code_p)
7850     {
7851       gcc_assert (op.code != COND_EXPR);
7852
7853       /* 4. Supportable by target?  */
7854       bool ok = true;
7855
7856       /* 4.1. check support for the operation in the loop
7857
7858          This isn't necessary for the lane reduction codes, since they
7859          can only be produced by pattern matching, and it's up to the
7860          pattern matcher to test for support.  The main reason for
7861          specifically skipping this step is to avoid rechecking whether
7862          mixed-sign dot-products can be implemented using signed
7863          dot-products.  */
7864       machine_mode vec_mode = TYPE_MODE (vectype_in);
7865       if (!lane_reduc_code_p
7866           && !directly_supported_p (op.code, vectype_in, optab_vector))
7867         {
7868           if (dump_enabled_p ())
7869             dump_printf (MSG_NOTE, "op not supported by target.\n");
7870           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7871               || !vect_can_vectorize_without_simd_p (op.code))
7872             ok = false;
7873           else
7874             if (dump_enabled_p ())
7875               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7876         }
7877
7878       if (vect_emulated_vector_p (vectype_in)
7879           && !vect_can_vectorize_without_simd_p (op.code))
7880         {
7881           if (dump_enabled_p ())
7882             dump_printf (MSG_NOTE, "using word mode not possible.\n");
7883           return false;
7884         }
7885
7886       /* lane-reducing operations have to go through vect_transform_reduction.
7887          For the other cases try without the single cycle optimization.  */
7888       if (!ok)
7889         {
7890           if (lane_reduc_code_p)
7891             return false;
7892           else
7893             single_defuse_cycle = false;
7894         }
7895     }
7896   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7897
7898   /* If the reduction stmt is one of the patterns that have lane
7899      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7900   if ((ncopies > 1 && ! single_defuse_cycle)
7901       && lane_reduc_code_p)
7902     {
7903       if (dump_enabled_p ())
7904         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7905                          "multi def-use cycle not possible for lane-reducing "
7906                          "reduction operation\n");
7907       return false;
7908     }
7909
7910   if (slp_node
7911       && !(!single_defuse_cycle
7912            && !lane_reduc_code_p
7913            && reduction_type != FOLD_LEFT_REDUCTION))
7914     for (i = 0; i < (int) op.num_ops; i++)
7915       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7916         {
7917           if (dump_enabled_p ())
7918             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7919                              "incompatible vector types for invariants\n");
7920           return false;
7921         }
7922
7923   if (slp_node)
7924     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7925   else
7926     vec_num = 1;
7927
7928   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7929                              reduction_type, ncopies, cost_vec);
7930   /* Cost the reduction op inside the loop if transformed via
7931      vect_transform_reduction.  Otherwise this is costed by the
7932      separate vectorizable_* routines.  */
7933   if (single_defuse_cycle || lane_reduc_code_p)
7934     {
7935       int factor = 1;
7936       if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7937         /* Three dot-products and a subtraction.  */
7938         factor = 4;
7939       record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7940                         stmt_info, 0, vect_body);
7941     }
7942
7943   if (dump_enabled_p ()
7944       && reduction_type == FOLD_LEFT_REDUCTION)
7945     dump_printf_loc (MSG_NOTE, vect_location,
7946                      "using an in-order (fold-left) reduction.\n");
7947   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7948   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7949      reductions go through their own vectorizable_* routines.  */
7950   if (!single_defuse_cycle
7951       && !lane_reduc_code_p
7952       && reduction_type != FOLD_LEFT_REDUCTION)
7953     {
7954       stmt_vec_info tem
7955         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7956       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7957         {
7958           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7959           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7960         }
7961       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7962       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7963     }
7964   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7965     {
7966       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7967       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7968
7969       if (reduction_type != FOLD_LEFT_REDUCTION
7970           && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7971           && (cond_fn == IFN_LAST
7972               || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7973                                                   OPTIMIZE_FOR_SPEED)))
7974         {
7975           if (dump_enabled_p ())
7976             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977                              "can't operate on partial vectors because"
7978                              " no conditional operation is available.\n");
7979           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7980         }
7981       else if (reduction_type == FOLD_LEFT_REDUCTION
7982                && reduc_fn == IFN_LAST
7983                && !expand_vec_cond_expr_p (vectype_in,
7984                                            truth_type_for (vectype_in),
7985                                            SSA_NAME))
7986         {
7987           if (dump_enabled_p ())
7988             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989                              "can't operate on partial vectors because"
7990                              " no conditional operation is available.\n");
7991           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7992         }
7993       else
7994         vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7995                                vectype_in, NULL);
7996     }
7997   return true;
7998 }
7999
8000 /* STMT_INFO is a dot-product reduction whose multiplication operands
8001    have different signs.  Emit a sequence to emulate the operation
8002    using a series of signed DOT_PROD_EXPRs and return the last
8003    statement generated.  VEC_DEST is the result of the vector operation
8004    and VOP lists its inputs.  */
8005
8006 static gassign *
8007 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8008                              gimple_stmt_iterator *gsi, tree vec_dest,
8009                              tree vop[3])
8010 {
8011   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8012   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8013   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8014   gimple *new_stmt;
8015
8016   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
8017   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8018     std::swap (vop[0], vop[1]);
8019
8020   /* Convert all inputs to signed types.  */
8021   for (int i = 0; i < 3; ++i)
8022     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8023       {
8024         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8025         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8026         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8027         vop[i] = tmp;
8028       }
8029
8030   /* In the comments below we assume 8-bit inputs for simplicity,
8031      but the approach works for any full integer type.  */
8032
8033   /* Create a vector of -128.  */
8034   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8035   tree min_narrow = build_vector_from_val (narrow_vectype,
8036                                            min_narrow_elttype);
8037
8038   /* Create a vector of 64.  */
8039   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8040   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8041   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8042
8043   /* Emit: SUB_RES = VOP[0] - 128.  */
8044   tree sub_res = make_ssa_name (narrow_vectype);
8045   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8046   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8047
8048   /* Emit:
8049
8050        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8051        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8052        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8053
8054      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8055      Doing the two 64 * y steps first allows more time to compute x.  */
8056   tree stage1 = make_ssa_name (wide_vectype);
8057   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8058                                   vop[1], half_narrow, vop[2]);
8059   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8060
8061   tree stage2 = make_ssa_name (wide_vectype);
8062   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8063                                   vop[1], half_narrow, stage1);
8064   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8065
8066   tree stage3 = make_ssa_name (wide_vectype);
8067   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8068                                   sub_res, vop[1], stage2);
8069   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8070
8071   /* Convert STAGE3 to the reduction type.  */
8072   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8073 }
8074
8075 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8076    value.  */
8077
8078 bool
8079 vect_transform_reduction (loop_vec_info loop_vinfo,
8080                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8081                           gimple **vec_stmt, slp_tree slp_node)
8082 {
8083   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8084   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8085   int i;
8086   int ncopies;
8087   int vec_num;
8088
8089   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8090   gcc_assert (reduc_info->is_reduc_info);
8091
8092   if (nested_in_vect_loop_p (loop, stmt_info))
8093     {
8094       loop = loop->inner;
8095       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8096     }
8097
8098   gimple_match_op op;
8099   if (!gimple_extract_op (stmt_info->stmt, &op))
8100     gcc_unreachable ();
8101
8102   /* All uses but the last are expected to be defined in the loop.
8103      The last use is the reduction variable.  In case of nested cycle this
8104      assumption is not true: we use reduc_index to record the index of the
8105      reduction variable.  */
8106   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8107   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8108   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8109   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8110
8111   if (slp_node)
8112     {
8113       ncopies = 1;
8114       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8115     }
8116   else
8117     {
8118       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8119       vec_num = 1;
8120     }
8121
8122   code_helper code = canonicalize_code (op.code, op.type);
8123   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8124   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8125   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8126
8127   /* Transform.  */
8128   tree new_temp = NULL_TREE;
8129   auto_vec<tree> vec_oprnds0;
8130   auto_vec<tree> vec_oprnds1;
8131   auto_vec<tree> vec_oprnds2;
8132   tree def0;
8133
8134   if (dump_enabled_p ())
8135     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8136
8137   /* FORNOW: Multiple types are not supported for condition.  */
8138   if (code == COND_EXPR)
8139     gcc_assert (ncopies == 1);
8140
8141   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8142
8143   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8144   if (reduction_type == FOLD_LEFT_REDUCTION)
8145     {
8146       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8147       gcc_assert (code.is_tree_code ());
8148       return vectorize_fold_left_reduction
8149           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8150            tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
8151     }
8152
8153   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8154   gcc_assert (single_defuse_cycle
8155               || code == DOT_PROD_EXPR
8156               || code == WIDEN_SUM_EXPR
8157               || code == SAD_EXPR);
8158
8159   /* Create the destination vector  */
8160   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8161   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8162
8163   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8164                      single_defuse_cycle && reduc_index == 0
8165                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
8166                      single_defuse_cycle && reduc_index == 1
8167                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
8168                      op.num_ops == 3
8169                      && !(single_defuse_cycle && reduc_index == 2)
8170                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8171   if (single_defuse_cycle)
8172     {
8173       gcc_assert (!slp_node);
8174       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8175                                      op.ops[reduc_index],
8176                                      reduc_index == 0 ? &vec_oprnds0
8177                                      : (reduc_index == 1 ? &vec_oprnds1
8178                                         : &vec_oprnds2));
8179     }
8180
8181   bool emulated_mixed_dot_prod
8182     = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8183   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8184     {
8185       gimple *new_stmt;
8186       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8187       if (masked_loop_p && !mask_by_cond_expr)
8188         {
8189           /* No conditional ifns have been defined for dot-product yet.  */
8190           gcc_assert (code != DOT_PROD_EXPR);
8191
8192           /* Make sure that the reduction accumulator is vop[0].  */
8193           if (reduc_index == 1)
8194             {
8195               gcc_assert (commutative_binary_op_p (code, op.type));
8196               std::swap (vop[0], vop[1]);
8197             }
8198           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8199                                           vec_num * ncopies, vectype_in, i);
8200           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8201                                                     vop[0], vop[1], vop[0]);
8202           new_temp = make_ssa_name (vec_dest, call);
8203           gimple_call_set_lhs (call, new_temp);
8204           gimple_call_set_nothrow (call, true);
8205           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8206           new_stmt = call;
8207         }
8208       else
8209         {
8210           if (op.num_ops == 3)
8211             vop[2] = vec_oprnds2[i];
8212
8213           if (masked_loop_p && mask_by_cond_expr)
8214             {
8215               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8216                                               vec_num * ncopies, vectype_in, i);
8217               build_vect_cond_expr (code, vop, mask, gsi);
8218             }
8219
8220           if (emulated_mixed_dot_prod)
8221             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8222                                                     vec_dest, vop);
8223           else if (code.is_internal_fn ())
8224             new_stmt = gimple_build_call_internal (internal_fn (code),
8225                                                    op.num_ops,
8226                                                    vop[0], vop[1], vop[2]);
8227           else
8228             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8229                                             vop[0], vop[1], vop[2]);
8230           new_temp = make_ssa_name (vec_dest, new_stmt);
8231           gimple_set_lhs (new_stmt, new_temp);
8232           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8233         }
8234
8235       if (slp_node)
8236         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
8237       else if (single_defuse_cycle
8238                && i < ncopies - 1)
8239         {
8240           if (reduc_index == 0)
8241             vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8242           else if (reduc_index == 1)
8243             vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8244           else if (reduc_index == 2)
8245             vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8246         }
8247       else
8248         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8249     }
8250
8251   if (!slp_node)
8252     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8253
8254   return true;
8255 }
8256
8257 /* Transform phase of a cycle PHI.  */
8258
8259 bool
8260 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8261                           stmt_vec_info stmt_info, gimple **vec_stmt,
8262                           slp_tree slp_node, slp_instance slp_node_instance)
8263 {
8264   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8265   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8266   int i;
8267   int ncopies;
8268   int j;
8269   bool nested_cycle = false;
8270   int vec_num;
8271
8272   if (nested_in_vect_loop_p (loop, stmt_info))
8273     {
8274       loop = loop->inner;
8275       nested_cycle = true;
8276     }
8277
8278   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8279   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8280   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8281   gcc_assert (reduc_info->is_reduc_info);
8282
8283   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8284       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8285     /* Leave the scalar phi in place.  */
8286     return true;
8287
8288   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8289   /* For a nested cycle we do not fill the above.  */
8290   if (!vectype_in)
8291     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8292   gcc_assert (vectype_in);
8293
8294   if (slp_node)
8295     {
8296       /* The size vect_schedule_slp_instance computes is off for us.  */
8297       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8298                                       * SLP_TREE_LANES (slp_node), vectype_in);
8299       ncopies = 1;
8300     }
8301   else
8302     {
8303       vec_num = 1;
8304       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8305     }
8306
8307   /* Check whether we should use a single PHI node and accumulate
8308      vectors to one before the backedge.  */
8309   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8310     ncopies = 1;
8311
8312   /* Create the destination vector  */
8313   gphi *phi = as_a <gphi *> (stmt_info->stmt);
8314   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8315                                                vectype_out);
8316
8317   /* Get the loop-entry arguments.  */
8318   tree vec_initial_def = NULL_TREE;
8319   auto_vec<tree> vec_initial_defs;
8320   if (slp_node)
8321     {
8322       vec_initial_defs.reserve (vec_num);
8323       if (nested_cycle)
8324         {
8325           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8326           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8327                              &vec_initial_defs);
8328         }
8329       else
8330         {
8331           gcc_assert (slp_node == slp_node_instance->reduc_phis);
8332           vec<tree> &initial_values = reduc_info->reduc_initial_values;
8333           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8334
8335           unsigned int num_phis = stmts.length ();
8336           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8337             num_phis = 1;
8338           initial_values.reserve (num_phis);
8339           for (unsigned int i = 0; i < num_phis; ++i)
8340             {
8341               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8342               initial_values.quick_push (vect_phi_initial_value (this_phi));
8343             }
8344           if (vec_num == 1)
8345             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8346           if (!initial_values.is_empty ())
8347             {
8348               tree initial_value
8349                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8350               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8351               tree neutral_op
8352                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8353                                             code, initial_value);
8354               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8355                                               &vec_initial_defs, vec_num,
8356                                               stmts.length (), neutral_op);
8357             }
8358         }
8359     }
8360   else
8361     {
8362       /* Get at the scalar def before the loop, that defines the initial
8363          value of the reduction variable.  */
8364       tree initial_def = vect_phi_initial_value (phi);
8365       reduc_info->reduc_initial_values.safe_push (initial_def);
8366       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8367          and we can't use zero for induc_val, use initial_def.  Similarly
8368          for REDUC_MIN and initial_def larger than the base.  */
8369       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8370         {
8371           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8372           if (TREE_CODE (initial_def) == INTEGER_CST
8373               && !integer_zerop (induc_val)
8374               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8375                    && tree_int_cst_lt (initial_def, induc_val))
8376                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8377                       && tree_int_cst_lt (induc_val, initial_def))))
8378             {
8379               induc_val = initial_def;
8380               /* Communicate we used the initial_def to epilouge
8381                  generation.  */
8382               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8383             }
8384           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8385         }
8386       else if (nested_cycle)
8387         {
8388           /* Do not use an adjustment def as that case is not supported
8389              correctly if ncopies is not one.  */
8390           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8391                                          ncopies, initial_def,
8392                                          &vec_initial_defs);
8393         }
8394       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8395                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8396         /* Fill the initial vector with the initial scalar value.  */
8397         vec_initial_def
8398           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8399                                            initial_def, initial_def);
8400       else
8401         {
8402           if (ncopies == 1)
8403             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8404           if (!reduc_info->reduc_initial_values.is_empty ())
8405             {
8406               initial_def = reduc_info->reduc_initial_values[0];
8407               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8408               tree neutral_op
8409                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8410                                             code, initial_def);
8411               gcc_assert (neutral_op);
8412               /* Try to simplify the vector initialization by applying an
8413                  adjustment after the reduction has been performed.  */
8414               if (!reduc_info->reused_accumulator
8415                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8416                   && !operand_equal_p (neutral_op, initial_def))
8417                 {
8418                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8419                     = initial_def;
8420                   initial_def = neutral_op;
8421                 }
8422               vec_initial_def
8423                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8424                                                  initial_def, neutral_op);
8425             }
8426         }
8427     }
8428
8429   if (vec_initial_def)
8430     {
8431       vec_initial_defs.create (ncopies);
8432       for (i = 0; i < ncopies; ++i)
8433         vec_initial_defs.quick_push (vec_initial_def);
8434     }
8435
8436   if (auto *accumulator = reduc_info->reused_accumulator)
8437     {
8438       tree def = accumulator->reduc_input;
8439       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8440         {
8441           unsigned int nreduc;
8442           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8443                                             (TREE_TYPE (def)),
8444                                           TYPE_VECTOR_SUBPARTS (vectype_out),
8445                                           &nreduc);
8446           gcc_assert (res);
8447           gimple_seq stmts = NULL;
8448           /* Reduce the single vector to a smaller one.  */
8449           if (nreduc != 1)
8450             {
8451               /* Perform the reduction in the appropriate type.  */
8452               tree rvectype = vectype_out;
8453               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8454                                               TREE_TYPE (TREE_TYPE (def))))
8455                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8456                                               TYPE_VECTOR_SUBPARTS
8457                                                 (vectype_out));
8458               def = vect_create_partial_epilog (def, rvectype,
8459                                                 STMT_VINFO_REDUC_CODE
8460                                                   (reduc_info),
8461                                                 &stmts);
8462             }
8463           /* The epilogue loop might use a different vector mode, like
8464              VNx2DI vs. V2DI.  */
8465           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8466             {
8467               tree reduc_type = build_vector_type_for_mode
8468                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8469               def = gimple_convert (&stmts, reduc_type, def);
8470             }
8471           /* Adjust the input so we pick up the partially reduced value
8472              for the skip edge in vect_create_epilog_for_reduction.  */
8473           accumulator->reduc_input = def;
8474           /* And the reduction could be carried out using a different sign.  */
8475           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8476             def = gimple_convert (&stmts, vectype_out, def);
8477           if (loop_vinfo->main_loop_edge)
8478             {
8479               /* While we'd like to insert on the edge this will split
8480                  blocks and disturb bookkeeping, we also will eventually
8481                  need this on the skip edge.  Rely on sinking to
8482                  fixup optimal placement and insert in the pred.  */
8483               gimple_stmt_iterator gsi
8484                 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8485               /* Insert before a cond that eventually skips the
8486                  epilogue.  */
8487               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8488                 gsi_prev (&gsi);
8489               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8490             }
8491           else
8492             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8493                                               stmts);
8494         }
8495       if (loop_vinfo->main_loop_edge)
8496         vec_initial_defs[0]
8497           = vect_get_main_loop_result (loop_vinfo, def,
8498                                        vec_initial_defs[0]);
8499       else
8500         vec_initial_defs.safe_push (def);
8501     }
8502
8503   /* Generate the reduction PHIs upfront.  */
8504   for (i = 0; i < vec_num; i++)
8505     {
8506       tree vec_init_def = vec_initial_defs[i];
8507       for (j = 0; j < ncopies; j++)
8508         {
8509           /* Create the reduction-phi that defines the reduction
8510              operand.  */
8511           gphi *new_phi = create_phi_node (vec_dest, loop->header);
8512
8513           /* Set the loop-entry arg of the reduction-phi.  */
8514           if (j != 0 && nested_cycle)
8515             vec_init_def = vec_initial_defs[j];
8516           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8517                        UNKNOWN_LOCATION);
8518
8519           /* The loop-latch arg is set in epilogue processing.  */
8520
8521           if (slp_node)
8522             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8523           else
8524             {
8525               if (j == 0)
8526                 *vec_stmt = new_phi;
8527               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8528             }
8529         }
8530     }
8531
8532   return true;
8533 }
8534
8535 /* Vectorizes LC PHIs.  */
8536
8537 bool
8538 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8539                      stmt_vec_info stmt_info, gimple **vec_stmt,
8540                      slp_tree slp_node)
8541 {
8542   if (!loop_vinfo
8543       || !is_a <gphi *> (stmt_info->stmt)
8544       || gimple_phi_num_args (stmt_info->stmt) != 1)
8545     return false;
8546
8547   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8548       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8549     return false;
8550
8551   if (!vec_stmt) /* transformation not required.  */
8552     {
8553       /* Deal with copies from externs or constants that disguise as
8554          loop-closed PHI nodes (PR97886).  */
8555       if (slp_node
8556           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8557                                                 SLP_TREE_VECTYPE (slp_node)))
8558         {
8559           if (dump_enabled_p ())
8560             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8561                              "incompatible vector types for invariants\n");
8562           return false;
8563         }
8564       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8565       return true;
8566     }
8567
8568   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8569   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8570   basic_block bb = gimple_bb (stmt_info->stmt);
8571   edge e = single_pred_edge (bb);
8572   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8573   auto_vec<tree> vec_oprnds;
8574   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8575                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8576                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8577   for (unsigned i = 0; i < vec_oprnds.length (); i++)
8578     {
8579       /* Create the vectorized LC PHI node.  */
8580       gphi *new_phi = create_phi_node (vec_dest, bb);
8581       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8582       if (slp_node)
8583         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8584       else
8585         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8586     }
8587   if (!slp_node)
8588     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8589
8590   return true;
8591 }
8592
8593 /* Vectorizes PHIs.  */
8594
8595 bool
8596 vectorizable_phi (vec_info *,
8597                   stmt_vec_info stmt_info, gimple **vec_stmt,
8598                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8599 {
8600   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8601     return false;
8602
8603   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8604     return false;
8605
8606   tree vectype = SLP_TREE_VECTYPE (slp_node);
8607
8608   if (!vec_stmt) /* transformation not required.  */
8609     {
8610       slp_tree child;
8611       unsigned i;
8612       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8613         if (!child)
8614           {
8615             if (dump_enabled_p ())
8616               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8617                                "PHI node with unvectorized backedge def\n");
8618             return false;
8619           }
8620         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8621           {
8622             if (dump_enabled_p ())
8623               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8624                                "incompatible vector types for invariants\n");
8625             return false;
8626           }
8627         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8628                  && !useless_type_conversion_p (vectype,
8629                                                 SLP_TREE_VECTYPE (child)))
8630           {
8631             /* With bools we can have mask and non-mask precision vectors
8632                or different non-mask precisions.  while pattern recog is
8633                supposed to guarantee consistency here bugs in it can cause
8634                mismatches (PR103489 and PR103800 for example).
8635                Deal with them here instead of ICEing later.  */
8636             if (dump_enabled_p ())
8637               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8638                                "incompatible vector type setup from "
8639                                "bool pattern detection\n");
8640             return false;
8641           }
8642
8643       /* For single-argument PHIs assume coalescing which means zero cost
8644          for the scalar and the vector PHIs.  This avoids artificially
8645          favoring the vector path (but may pessimize it in some cases).  */
8646       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8647         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8648                           vector_stmt, stmt_info, vectype, 0, vect_body);
8649       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8650       return true;
8651     }
8652
8653   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8654   basic_block bb = gimple_bb (stmt_info->stmt);
8655   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8656   auto_vec<gphi *> new_phis;
8657   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8658     {
8659       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8660
8661       /* Skip not yet vectorized defs.  */
8662       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8663           && SLP_TREE_VEC_STMTS (child).is_empty ())
8664         continue;
8665
8666       auto_vec<tree> vec_oprnds;
8667       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8668       if (!new_phis.exists ())
8669         {
8670           new_phis.create (vec_oprnds.length ());
8671           for (unsigned j = 0; j < vec_oprnds.length (); j++)
8672             {
8673               /* Create the vectorized LC PHI node.  */
8674               new_phis.quick_push (create_phi_node (vec_dest, bb));
8675               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8676             }
8677         }
8678       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8679       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8680         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8681     }
8682   /* We should have at least one already vectorized child.  */
8683   gcc_assert (new_phis.exists ());
8684
8685   return true;
8686 }
8687
8688 /* Vectorizes first order recurrences.  An overview of the transformation
8689    is described below. Suppose we have the following loop.
8690
8691      int t = 0;
8692      for (int i = 0; i < n; ++i)
8693        {
8694          b[i] = a[i] - t;
8695          t = a[i];
8696        }
8697
8698    There is a first-order recurrence on 'a'. For this loop, the scalar IR
8699    looks (simplified) like:
8700
8701     scalar.preheader:
8702       init = 0;
8703
8704     scalar.body:
8705       i = PHI <0(scalar.preheader), i+1(scalar.body)>
8706       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8707       _1 = a[i]
8708       b[i] = _1 - _2
8709       if (i < n) goto scalar.body
8710
8711    In this example, _2 is a recurrence because it's value depends on the
8712    previous iteration.  We vectorize this as (VF = 4)
8713
8714     vector.preheader:
8715       vect_init = vect_cst(..., ..., ..., 0)
8716
8717     vector.body
8718       i = PHI <0(vector.preheader), i+4(vector.body)>
8719       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8720       vect_2 = a[i, i+1, i+2, i+3];
8721       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8722       b[i, i+1, i+2, i+3] = vect_2 - vect_3
8723       if (..) goto vector.body
8724
8725    In this function, vectorizable_recurr, we code generate both the
8726    vector PHI node and the permute since those together compute the
8727    vectorized value of the scalar PHI.  We do not yet have the
8728    backedge value to fill in there nor into the vec_perm.  Those
8729    are filled in maybe_set_vectorized_backedge_value and
8730    vect_schedule_scc.
8731
8732    TODO:  Since the scalar loop does not have a use of the recurrence
8733    outside of the loop the natural way to implement peeling via
8734    vectorizing the live value doesn't work.  For now peeling of loops
8735    with a recurrence is not implemented.  For SLP the supported cases
8736    are restricted to those requiring a single vector recurrence PHI.  */
8737
8738 bool
8739 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8740                      gimple **vec_stmt, slp_tree slp_node,
8741                      stmt_vector_for_cost *cost_vec)
8742 {
8743   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8744     return false;
8745
8746   gphi *phi = as_a<gphi *> (stmt_info->stmt);
8747
8748   /* So far we only support first-order recurrence auto-vectorization.  */
8749   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8750     return false;
8751
8752   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8753   unsigned ncopies;
8754   if (slp_node)
8755     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8756   else
8757     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8758   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8759   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8760   /* We need to be able to make progress with a single vector.  */
8761   if (maybe_gt (dist * 2, nunits))
8762     {
8763       if (dump_enabled_p ())
8764         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8765                          "first order recurrence exceeds half of "
8766                          "a vector\n");
8767       return false;
8768     }
8769
8770   /* First-order recurrence autovectorization needs to handle permutation
8771      with indices = [nunits-1, nunits, nunits+1, ...].  */
8772   vec_perm_builder sel (nunits, 1, 3);
8773   for (int i = 0; i < 3; ++i)
8774     sel.quick_push (nunits - dist + i);
8775   vec_perm_indices indices (sel, 2, nunits);
8776
8777   if (!vec_stmt) /* transformation not required.  */
8778     {
8779       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8780                                  indices))
8781         return false;
8782
8783       if (slp_node)
8784         {
8785           /* We eventually need to set a vector type on invariant
8786              arguments.  */
8787           unsigned j;
8788           slp_tree child;
8789           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8790             if (!vect_maybe_update_slp_op_vectype
8791                   (child, SLP_TREE_VECTYPE (slp_node)))
8792               {
8793                 if (dump_enabled_p ())
8794                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8795                                    "incompatible vector types for "
8796                                    "invariants\n");
8797                 return false;
8798               }
8799         }
8800       /* The recurrence costs the initialization vector and one permute
8801          for each copy.  */
8802       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8803                                                  stmt_info, 0, vect_prologue);
8804       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8805                                                stmt_info, 0, vect_body);
8806       if (dump_enabled_p ())
8807         dump_printf_loc (MSG_NOTE, vect_location,
8808                          "vectorizable_recurr: inside_cost = %d, "
8809                          "prologue_cost = %d .\n", inside_cost,
8810                          prologue_cost);
8811
8812       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8813       return true;
8814     }
8815
8816   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8817   basic_block bb = gimple_bb (phi);
8818   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8819   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8820     {
8821       gimple_seq stmts = NULL;
8822       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8823       gsi_insert_seq_on_edge_immediate (pe, stmts);
8824     }
8825   tree vec_init = build_vector_from_val (vectype, preheader);
8826   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8827
8828   /* Create the vectorized first-order PHI node.  */
8829   tree vec_dest = vect_get_new_vect_var (vectype,
8830                                          vect_simple_var, "vec_recur_");
8831   gphi *new_phi = create_phi_node (vec_dest, bb);
8832   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8833
8834   /* Insert shuffles the first-order recurrence autovectorization.
8835        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
8836   tree perm = vect_gen_perm_mask_checked (vectype, indices);
8837
8838   /* Insert the required permute after the latch definition.  The
8839      second and later operands are tentative and will be updated when we have
8840      vectorized the latch definition.  */
8841   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8842   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8843   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8844   gsi_next (&gsi2);
8845
8846   for (unsigned i = 0; i < ncopies; ++i)
8847     {
8848       vec_dest = make_ssa_name (vectype);
8849       gassign *vperm
8850           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8851                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
8852                                  NULL, perm);
8853       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8854
8855       if (slp_node)
8856         SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8857       else
8858         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8859     }
8860
8861   if (!slp_node)
8862     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8863   return true;
8864 }
8865
8866 /* Return true if VECTYPE represents a vector that requires lowering
8867    by the vector lowering pass.  */
8868
8869 bool
8870 vect_emulated_vector_p (tree vectype)
8871 {
8872   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8873           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8874               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8875 }
8876
8877 /* Return true if we can emulate CODE on an integer mode representation
8878    of a vector.  */
8879
8880 bool
8881 vect_can_vectorize_without_simd_p (tree_code code)
8882 {
8883   switch (code)
8884     {
8885     case PLUS_EXPR:
8886     case MINUS_EXPR:
8887     case NEGATE_EXPR:
8888     case BIT_AND_EXPR:
8889     case BIT_IOR_EXPR:
8890     case BIT_XOR_EXPR:
8891     case BIT_NOT_EXPR:
8892       return true;
8893
8894     default:
8895       return false;
8896     }
8897 }
8898
8899 /* Likewise, but taking a code_helper.  */
8900
8901 bool
8902 vect_can_vectorize_without_simd_p (code_helper code)
8903 {
8904   return (code.is_tree_code ()
8905           && vect_can_vectorize_without_simd_p (tree_code (code)));
8906 }
8907
8908 /* Create vector init for vectorized iv.  */
8909 static tree
8910 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8911                                tree step_expr, poly_uint64 nunits,
8912                                tree vectype,
8913                                enum vect_induction_op_type induction_type)
8914 {
8915   unsigned HOST_WIDE_INT const_nunits;
8916   tree vec_shift, vec_init, new_name;
8917   unsigned i;
8918   tree itype = TREE_TYPE (vectype);
8919
8920   /* iv_loop is the loop to be vectorized. Create:
8921      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
8922   new_name = gimple_convert (stmts, itype, init_expr);
8923   switch (induction_type)
8924     {
8925     case vect_step_op_shr:
8926     case vect_step_op_shl:
8927       /* Build the Initial value from shift_expr.  */
8928       vec_init = gimple_build_vector_from_val (stmts,
8929                                                vectype,
8930                                                new_name);
8931       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8932                                 build_zero_cst (itype), step_expr);
8933       vec_init = gimple_build (stmts,
8934                                (induction_type == vect_step_op_shr
8935                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
8936                                vectype, vec_init, vec_shift);
8937       break;
8938
8939     case vect_step_op_neg:
8940       {
8941         vec_init = gimple_build_vector_from_val (stmts,
8942                                                  vectype,
8943                                                  new_name);
8944         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8945                                      vectype, vec_init);
8946         /* The encoding has 2 interleaved stepped patterns.  */
8947         vec_perm_builder sel (nunits, 2, 3);
8948         sel.quick_grow (6);
8949         for (i = 0; i < 3; i++)
8950           {
8951             sel[2 * i] = i;
8952             sel[2 * i + 1] = i + nunits;
8953           }
8954         vec_perm_indices indices (sel, 2, nunits);
8955         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8956            fail when vec_init is const vector. In that situation vec_perm is not
8957            really needed.  */
8958         tree perm_mask_even
8959           = vect_gen_perm_mask_any (vectype, indices);
8960         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8961                                  vectype,
8962                                  vec_init, vec_neg,
8963                                  perm_mask_even);
8964       }
8965       break;
8966
8967     case vect_step_op_mul:
8968       {
8969         /* Use unsigned mult to avoid UD integer overflow.  */
8970         gcc_assert (nunits.is_constant (&const_nunits));
8971         tree utype = unsigned_type_for (itype);
8972         tree uvectype = build_vector_type (utype,
8973                                            TYPE_VECTOR_SUBPARTS (vectype));
8974         new_name = gimple_convert (stmts, utype, new_name);
8975         vec_init = gimple_build_vector_from_val (stmts,
8976                                                  uvectype,
8977                                                  new_name);
8978         tree_vector_builder elts (uvectype, const_nunits, 1);
8979         tree elt_step = build_one_cst (utype);
8980
8981         elts.quick_push (elt_step);
8982         for (i = 1; i < const_nunits; i++)
8983           {
8984             /* Create: new_name_i = new_name + step_expr.  */
8985             elt_step = gimple_build (stmts, MULT_EXPR,
8986                                      utype, elt_step, step_expr);
8987             elts.quick_push (elt_step);
8988           }
8989         /* Create a vector from [new_name_0, new_name_1, ...,
8990            new_name_nunits-1].  */
8991         tree vec_mul = gimple_build_vector (stmts, &elts);
8992         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8993                                  vec_init, vec_mul);
8994         vec_init = gimple_convert (stmts, vectype, vec_init);
8995       }
8996       break;
8997
8998     default:
8999       gcc_unreachable ();
9000     }
9001
9002   return vec_init;
9003 }
9004
9005 /* Peel init_expr by skip_niter for induction_type.  */
9006 tree
9007 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9008                              tree skip_niters, tree step_expr,
9009                              enum vect_induction_op_type induction_type)
9010 {
9011   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9012   tree type = TREE_TYPE (init_expr);
9013   unsigned prec = TYPE_PRECISION (type);
9014   switch (induction_type)
9015     {
9016     case vect_step_op_neg:
9017       if (TREE_INT_CST_LOW (skip_niters) % 2)
9018         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9019       /* else no change.  */
9020       break;
9021
9022     case vect_step_op_shr:
9023     case vect_step_op_shl:
9024       skip_niters = gimple_convert (stmts, type, skip_niters);
9025       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9026       /* When shift mount >= precision, need to avoid UD.
9027          In the original loop, there's no UD, and according to semantic,
9028          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9029       if (!tree_fits_uhwi_p (step_expr)
9030           || tree_to_uhwi (step_expr) >= prec)
9031         {
9032           if (induction_type == vect_step_op_shl
9033               || TYPE_UNSIGNED (type))
9034             init_expr = build_zero_cst (type);
9035           else
9036             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9037                                       init_expr,
9038                                       wide_int_to_tree (type, prec - 1));
9039         }
9040       else
9041         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9042                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9043                                   type, init_expr, step_expr);
9044       break;
9045
9046     case vect_step_op_mul:
9047       {
9048         tree utype = unsigned_type_for (type);
9049         init_expr = gimple_convert (stmts, utype, init_expr);
9050         unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9051         wide_int begin = wi::to_wide (step_expr);
9052         for (unsigned i = 0; i != skipn - 1; i++)
9053           begin = wi::mul (begin, wi::to_wide (step_expr));
9054         tree mult_expr = wide_int_to_tree (utype, begin);
9055         init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9056         init_expr = gimple_convert (stmts, type, init_expr);
9057       }
9058       break;
9059
9060     default:
9061       gcc_unreachable ();
9062     }
9063
9064   return init_expr;
9065 }
9066
9067 /* Create vector step for vectorized iv.  */
9068 static tree
9069 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9070                                poly_uint64 vf,
9071                                enum vect_induction_op_type induction_type)
9072 {
9073   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9074   tree new_name = NULL;
9075   /* Step should be pow (step, vf) for mult induction.  */
9076   if (induction_type == vect_step_op_mul)
9077     {
9078       gcc_assert (vf.is_constant ());
9079       wide_int begin = wi::to_wide (step_expr);
9080
9081       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9082         begin = wi::mul (begin, wi::to_wide (step_expr));
9083
9084       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9085     }
9086   else if (induction_type == vect_step_op_neg)
9087     /* Do nothing.  */
9088     ;
9089   else
9090     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9091                              expr, step_expr);
9092   return new_name;
9093 }
9094
9095 static tree
9096 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9097                                    stmt_vec_info stmt_info,
9098                                    tree new_name, tree vectype,
9099                                    enum vect_induction_op_type induction_type)
9100 {
9101   /* No step is needed for neg induction.  */
9102   if (induction_type == vect_step_op_neg)
9103     return NULL;
9104
9105   tree t = unshare_expr (new_name);
9106   gcc_assert (CONSTANT_CLASS_P (new_name)
9107               || TREE_CODE (new_name) == SSA_NAME);
9108   tree new_vec = build_vector_from_val (vectype, t);
9109   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9110                                     new_vec, vectype, NULL);
9111   return vec_step;
9112 }
9113
9114 /* Update vectorized iv with vect_step, induc_def is init.  */
9115 static tree
9116 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9117                           tree induc_def, tree vec_step,
9118                           enum vect_induction_op_type induction_type)
9119 {
9120   tree vec_def = induc_def;
9121   switch (induction_type)
9122     {
9123     case vect_step_op_mul:
9124       {
9125         /* Use unsigned mult to avoid UD integer overflow.  */
9126         tree uvectype
9127           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9128                                TYPE_VECTOR_SUBPARTS (vectype));
9129         vec_def = gimple_convert (stmts, uvectype, vec_def);
9130         vec_step = gimple_convert (stmts, uvectype, vec_step);
9131         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9132                                 vec_def, vec_step);
9133         vec_def = gimple_convert (stmts, vectype, vec_def);
9134       }
9135       break;
9136
9137     case vect_step_op_shr:
9138       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9139                               vec_def, vec_step);
9140       break;
9141
9142     case vect_step_op_shl:
9143       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9144                               vec_def, vec_step);
9145       break;
9146     case vect_step_op_neg:
9147       vec_def = induc_def;
9148       /* Do nothing.  */
9149       break;
9150     default:
9151       gcc_unreachable ();
9152     }
9153
9154   return vec_def;
9155
9156 }
9157
9158 /* Function vectorizable_induction
9159
9160    Check if STMT_INFO performs an nonlinear induction computation that can be
9161    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9162    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9163    basic block.
9164    Return true if STMT_INFO is vectorizable in this way.  */
9165
9166 static bool
9167 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9168                                   stmt_vec_info stmt_info,
9169                                   gimple **vec_stmt, slp_tree slp_node,
9170                                   stmt_vector_for_cost *cost_vec)
9171 {
9172   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9173   unsigned ncopies;
9174   bool nested_in_vect_loop = false;
9175   class loop *iv_loop;
9176   tree vec_def;
9177   edge pe = loop_preheader_edge (loop);
9178   basic_block new_bb;
9179   tree vec_init, vec_step;
9180   tree new_name;
9181   gimple *new_stmt;
9182   gphi *induction_phi;
9183   tree induc_def, vec_dest;
9184   tree init_expr, step_expr;
9185   tree niters_skip;
9186   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9187   unsigned i;
9188   gimple_stmt_iterator si;
9189
9190   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9191
9192   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9193   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9194   enum vect_induction_op_type induction_type
9195     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9196
9197   gcc_assert (induction_type > vect_step_op_add);
9198
9199   if (slp_node)
9200     ncopies = 1;
9201   else
9202     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9203   gcc_assert (ncopies >= 1);
9204
9205   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9206   if (nested_in_vect_loop_p (loop, stmt_info))
9207     {
9208       if (dump_enabled_p ())
9209         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9210                          "nonlinear induction in nested loop.\n");
9211       return false;
9212     }
9213
9214   iv_loop = loop;
9215   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9216
9217   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9218      update for each iv and a permutation to generate wanted vector iv.  */
9219   if (slp_node)
9220     {
9221       if (dump_enabled_p ())
9222         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9223                          "SLP induction not supported for nonlinear"
9224                          " induction.\n");
9225       return false;
9226     }
9227
9228   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9229     {
9230       if (dump_enabled_p ())
9231         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9232                          "floating point nonlinear induction vectorization"
9233                          " not supported.\n");
9234       return false;
9235     }
9236
9237   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9238   init_expr = vect_phi_initial_value (phi);
9239   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9240               && TREE_CODE (step_expr) == INTEGER_CST);
9241   /* step_expr should be aligned with init_expr,
9242      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9243   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9244
9245   if (TREE_CODE (init_expr) == INTEGER_CST)
9246     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9247   else
9248     gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9249                                        TREE_TYPE (init_expr)));
9250
9251   switch (induction_type)
9252     {
9253     case vect_step_op_neg:
9254       if (TREE_CODE (init_expr) != INTEGER_CST
9255           && TREE_CODE (init_expr) != REAL_CST)
9256         {
9257           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
9258           if (!directly_supported_p (NEGATE_EXPR, vectype))
9259             return false;
9260
9261           /* The encoding has 2 interleaved stepped patterns.  */
9262           vec_perm_builder sel (nunits, 2, 3);
9263           machine_mode mode = TYPE_MODE (vectype);
9264           sel.quick_grow (6);
9265           for (i = 0; i < 3; i++)
9266             {
9267               sel[i * 2] = i;
9268               sel[i * 2 + 1] = i + nunits;
9269             }
9270           vec_perm_indices indices (sel, 2, nunits);
9271           if (!can_vec_perm_const_p (mode, mode, indices))
9272             return false;
9273         }
9274       break;
9275
9276     case vect_step_op_mul:
9277       {
9278         /* Check for backend support of MULT_EXPR.  */
9279         if (!directly_supported_p (MULT_EXPR, vectype))
9280           return false;
9281
9282         /* ?? How to construct vector step for variable number vector.
9283            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9284         if (!vf.is_constant ())
9285           return false;
9286       }
9287       break;
9288
9289     case vect_step_op_shr:
9290       /* Check for backend support of RSHIFT_EXPR.  */
9291       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9292         return false;
9293
9294       /* Don't shift more than type precision to avoid UD.  */
9295       if (!tree_fits_uhwi_p (step_expr)
9296           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9297                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9298         return false;
9299       break;
9300
9301     case vect_step_op_shl:
9302       /* Check for backend support of RSHIFT_EXPR.  */
9303       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9304         return false;
9305
9306       /* Don't shift more than type precision to avoid UD.  */
9307       if (!tree_fits_uhwi_p (step_expr)
9308           || maybe_ge (nunits * tree_to_uhwi (step_expr),
9309                        TYPE_PRECISION (TREE_TYPE (init_expr))))
9310         return false;
9311
9312       break;
9313
9314     default:
9315       gcc_unreachable ();
9316     }
9317
9318   if (!vec_stmt) /* transformation not required.  */
9319     {
9320       unsigned inside_cost = 0, prologue_cost = 0;
9321       /* loop cost for vec_loop. Neg induction doesn't have any
9322          inside_cost.  */
9323       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9324                                       stmt_info, 0, vect_body);
9325
9326       /* loop cost for vec_loop. Neg induction doesn't have any
9327          inside_cost.  */
9328       if (induction_type == vect_step_op_neg)
9329         inside_cost = 0;
9330
9331       /* prologue cost for vec_init and vec_step.  */
9332       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9333                                         stmt_info, 0, vect_prologue);
9334
9335       if (dump_enabled_p ())
9336         dump_printf_loc (MSG_NOTE, vect_location,
9337                          "vect_model_induction_cost: inside_cost = %d, "
9338                          "prologue_cost = %d. \n", inside_cost,
9339                          prologue_cost);
9340
9341       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9342       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9343       return true;
9344     }
9345
9346   /* Transform.  */
9347
9348   /* Compute a vector variable, initialized with the first VF values of
9349      the induction variable.  E.g., for an iv with IV_PHI='X' and
9350      evolution S, for a vector of 4 units, we want to compute:
9351      [X, X + S, X + 2*S, X + 3*S].  */
9352
9353   if (dump_enabled_p ())
9354     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9355
9356   pe = loop_preheader_edge (iv_loop);
9357   /* Find the first insertion point in the BB.  */
9358   basic_block bb = gimple_bb (phi);
9359   si = gsi_after_labels (bb);
9360
9361   gimple_seq stmts = NULL;
9362
9363   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9364   /* If we are using the loop mask to "peel" for alignment then we need
9365      to adjust the start value here.  */
9366   if (niters_skip != NULL_TREE)
9367     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9368                                              step_expr, induction_type);
9369
9370   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9371                                             step_expr, nunits, vectype,
9372                                             induction_type);
9373   if (stmts)
9374     {
9375       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9376       gcc_assert (!new_bb);
9377     }
9378
9379   stmts = NULL;
9380   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9381                                             vf, induction_type);
9382   if (stmts)
9383     {
9384       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9385       gcc_assert (!new_bb);
9386     }
9387
9388   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9389                                                 new_name, vectype,
9390                                                 induction_type);
9391   /* Create the following def-use cycle:
9392      loop prolog:
9393      vec_init = ...
9394      vec_step = ...
9395      loop:
9396      vec_iv = PHI <vec_init, vec_loop>
9397      ...
9398      STMT
9399      ...
9400      vec_loop = vec_iv + vec_step;  */
9401
9402   /* Create the induction-phi that defines the induction-operand.  */
9403   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9404   induction_phi = create_phi_node (vec_dest, iv_loop->header);
9405   induc_def = PHI_RESULT (induction_phi);
9406
9407   /* Create the iv update inside the loop.  */
9408   stmts = NULL;
9409   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9410                                       induc_def, vec_step,
9411                                       induction_type);
9412
9413   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9414   new_stmt = SSA_NAME_DEF_STMT (vec_def);
9415
9416   /* Set the arguments of the phi node:  */
9417   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9418   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9419                UNKNOWN_LOCATION);
9420
9421   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9422   *vec_stmt = induction_phi;
9423
9424   /* In case that vectorization factor (VF) is bigger than the number
9425      of elements that we can fit in a vectype (nunits), we have to generate
9426      more than one vector stmt - i.e - we need to "unroll" the
9427      vector stmt by a factor VF/nunits.  For more details see documentation
9428      in vectorizable_operation.  */
9429
9430   if (ncopies > 1)
9431     {
9432       stmts = NULL;
9433       /* FORNOW. This restriction should be relaxed.  */
9434       gcc_assert (!nested_in_vect_loop);
9435
9436       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9437                                                 nunits, induction_type);
9438
9439       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9440                                                     new_name, vectype,
9441                                                     induction_type);
9442       vec_def = induc_def;
9443       for (i = 1; i < ncopies; i++)
9444         {
9445           /* vec_i = vec_prev + vec_step.  */
9446           stmts = NULL;
9447           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9448                                               vec_def, vec_step,
9449                                               induction_type);
9450           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9451           new_stmt = SSA_NAME_DEF_STMT (vec_def);
9452           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9453         }
9454     }
9455
9456   if (dump_enabled_p ())
9457     dump_printf_loc (MSG_NOTE, vect_location,
9458                      "transform induction: created def-use cycle: %G%G",
9459                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9460
9461   return true;
9462 }
9463
9464 /* Function vectorizable_induction
9465
9466    Check if STMT_INFO performs an induction computation that can be vectorized.
9467    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9468    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9469    Return true if STMT_INFO is vectorizable in this way.  */
9470
9471 bool
9472 vectorizable_induction (loop_vec_info loop_vinfo,
9473                         stmt_vec_info stmt_info,
9474                         gimple **vec_stmt, slp_tree slp_node,
9475                         stmt_vector_for_cost *cost_vec)
9476 {
9477   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9478   unsigned ncopies;
9479   bool nested_in_vect_loop = false;
9480   class loop *iv_loop;
9481   tree vec_def;
9482   edge pe = loop_preheader_edge (loop);
9483   basic_block new_bb;
9484   tree new_vec, vec_init, vec_step, t;
9485   tree new_name;
9486   gimple *new_stmt;
9487   gphi *induction_phi;
9488   tree induc_def, vec_dest;
9489   tree init_expr, step_expr;
9490   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9491   unsigned i;
9492   tree expr;
9493   gimple_stmt_iterator si;
9494   enum vect_induction_op_type induction_type
9495     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9496
9497   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9498   if (!phi)
9499     return false;
9500
9501   if (!STMT_VINFO_RELEVANT_P (stmt_info))
9502     return false;
9503
9504   /* Make sure it was recognized as induction computation.  */
9505   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9506     return false;
9507
9508   /* Handle nonlinear induction in a separate place.  */
9509   if (induction_type != vect_step_op_add)
9510     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9511                                              vec_stmt, slp_node, cost_vec);
9512
9513   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9514   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9515
9516   if (slp_node)
9517     ncopies = 1;
9518   else
9519     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9520   gcc_assert (ncopies >= 1);
9521
9522   /* FORNOW. These restrictions should be relaxed.  */
9523   if (nested_in_vect_loop_p (loop, stmt_info))
9524     {
9525       imm_use_iterator imm_iter;
9526       use_operand_p use_p;
9527       gimple *exit_phi;
9528       edge latch_e;
9529       tree loop_arg;
9530
9531       if (ncopies > 1)
9532         {
9533           if (dump_enabled_p ())
9534             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9535                              "multiple types in nested loop.\n");
9536           return false;
9537         }
9538
9539       exit_phi = NULL;
9540       latch_e = loop_latch_edge (loop->inner);
9541       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9542       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9543         {
9544           gimple *use_stmt = USE_STMT (use_p);
9545           if (is_gimple_debug (use_stmt))
9546             continue;
9547
9548           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9549             {
9550               exit_phi = use_stmt;
9551               break;
9552             }
9553         }
9554       if (exit_phi)
9555         {
9556           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9557           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9558                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9559             {
9560               if (dump_enabled_p ())
9561                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9562                                  "inner-loop induction only used outside "
9563                                  "of the outer vectorized loop.\n");
9564               return false;
9565             }
9566         }
9567
9568       nested_in_vect_loop = true;
9569       iv_loop = loop->inner;
9570     }
9571   else
9572     iv_loop = loop;
9573   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9574
9575   if (slp_node && !nunits.is_constant ())
9576     {
9577       /* The current SLP code creates the step value element-by-element.  */
9578       if (dump_enabled_p ())
9579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9580                          "SLP induction not supported for variable-length"
9581                          " vectors.\n");
9582       return false;
9583     }
9584
9585   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9586     {
9587       if (dump_enabled_p ())
9588         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9589                          "floating point induction vectorization disabled\n");
9590       return false;
9591     }
9592
9593   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9594   gcc_assert (step_expr != NULL_TREE);
9595   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9596
9597   /* Check for backend support of PLUS/MINUS_EXPR. */
9598   if (!directly_supported_p (PLUS_EXPR, step_vectype)
9599       || !directly_supported_p (MINUS_EXPR, step_vectype))
9600     return false;
9601
9602   if (!vec_stmt) /* transformation not required.  */
9603     {
9604       unsigned inside_cost = 0, prologue_cost = 0;
9605       if (slp_node)
9606         {
9607           /* We eventually need to set a vector type on invariant
9608              arguments.  */
9609           unsigned j;
9610           slp_tree child;
9611           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9612             if (!vect_maybe_update_slp_op_vectype
9613                 (child, SLP_TREE_VECTYPE (slp_node)))
9614               {
9615                 if (dump_enabled_p ())
9616                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9617                                    "incompatible vector types for "
9618                                    "invariants\n");
9619                 return false;
9620               }
9621           /* loop cost for vec_loop.  */
9622           inside_cost
9623             = record_stmt_cost (cost_vec,
9624                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9625                                 vector_stmt, stmt_info, 0, vect_body);
9626           /* prologue cost for vec_init (if not nested) and step.  */
9627           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9628                                             scalar_to_vec,
9629                                             stmt_info, 0, vect_prologue);
9630         }
9631       else /* if (!slp_node) */
9632         {
9633           /* loop cost for vec_loop.  */
9634           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9635                                           stmt_info, 0, vect_body);
9636           /* prologue cost for vec_init and vec_step.  */
9637           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9638                                             stmt_info, 0, vect_prologue);
9639         }
9640       if (dump_enabled_p ())
9641         dump_printf_loc (MSG_NOTE, vect_location,
9642                          "vect_model_induction_cost: inside_cost = %d, "
9643                          "prologue_cost = %d .\n", inside_cost,
9644                          prologue_cost);
9645
9646       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9647       DUMP_VECT_SCOPE ("vectorizable_induction");
9648       return true;
9649     }
9650
9651   /* Transform.  */
9652
9653   /* Compute a vector variable, initialized with the first VF values of
9654      the induction variable.  E.g., for an iv with IV_PHI='X' and
9655      evolution S, for a vector of 4 units, we want to compute:
9656      [X, X + S, X + 2*S, X + 3*S].  */
9657
9658   if (dump_enabled_p ())
9659     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9660
9661   pe = loop_preheader_edge (iv_loop);
9662   /* Find the first insertion point in the BB.  */
9663   basic_block bb = gimple_bb (phi);
9664   si = gsi_after_labels (bb);
9665
9666   /* For SLP induction we have to generate several IVs as for example
9667      with group size 3 we need
9668        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9669        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
9670   if (slp_node)
9671     {
9672       /* Enforced above.  */
9673       unsigned int const_nunits = nunits.to_constant ();
9674
9675       /* The initial values are vectorized, but any lanes > group_size
9676          need adjustment.  */
9677       slp_tree init_node
9678         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9679
9680       /* Gather steps.  Since we do not vectorize inductions as
9681          cycles we have to reconstruct the step from SCEV data.  */
9682       unsigned group_size = SLP_TREE_LANES (slp_node);
9683       tree *steps = XALLOCAVEC (tree, group_size);
9684       tree *inits = XALLOCAVEC (tree, group_size);
9685       stmt_vec_info phi_info;
9686       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9687         {
9688           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9689           if (!init_node)
9690             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9691                                            pe->dest_idx);
9692         }
9693
9694       /* Now generate the IVs.  */
9695       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9696       gcc_assert ((const_nunits * nvects) % group_size == 0);
9697       unsigned nivs;
9698       if (nested_in_vect_loop)
9699         nivs = nvects;
9700       else
9701         {
9702           /* Compute the number of distinct IVs we need.  First reduce
9703              group_size if it is a multiple of const_nunits so we get
9704              one IV for a group_size of 4 but const_nunits 2.  */
9705           unsigned group_sizep = group_size;
9706           if (group_sizep % const_nunits == 0)
9707             group_sizep = group_sizep / const_nunits;
9708           nivs = least_common_multiple (group_sizep,
9709                                         const_nunits) / const_nunits;
9710         }
9711       tree stept = TREE_TYPE (step_vectype);
9712       tree lupdate_mul = NULL_TREE;
9713       if (!nested_in_vect_loop)
9714         {
9715           /* The number of iterations covered in one vector iteration.  */
9716           unsigned lup_mul = (nvects * const_nunits) / group_size;
9717           lupdate_mul
9718             = build_vector_from_val (step_vectype,
9719                                      SCALAR_FLOAT_TYPE_P (stept)
9720                                      ? build_real_from_wide (stept, lup_mul,
9721                                                              UNSIGNED)
9722                                      : build_int_cstu (stept, lup_mul));
9723         }
9724       tree peel_mul = NULL_TREE;
9725       gimple_seq init_stmts = NULL;
9726       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9727         {
9728           if (SCALAR_FLOAT_TYPE_P (stept))
9729             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9730                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9731           else
9732             peel_mul = gimple_convert (&init_stmts, stept,
9733                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9734           peel_mul = gimple_build_vector_from_val (&init_stmts,
9735                                                    step_vectype, peel_mul);
9736         }
9737       unsigned ivn;
9738       auto_vec<tree> vec_steps;
9739       for (ivn = 0; ivn < nivs; ++ivn)
9740         {
9741           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9742           tree_vector_builder init_elts (vectype, const_nunits, 1);
9743           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9744           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9745             {
9746               /* The scalar steps of the IVs.  */
9747               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9748               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9749               step_elts.quick_push (elt);
9750               if (!init_node)
9751                 {
9752                   /* The scalar inits of the IVs if not vectorized.  */
9753                   elt = inits[(ivn*const_nunits + eltn) % group_size];
9754                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
9755                                                   TREE_TYPE (elt)))
9756                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9757                                         TREE_TYPE (vectype), elt);
9758                   init_elts.quick_push (elt);
9759                 }
9760               /* The number of steps to add to the initial values.  */
9761               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9762               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9763                                    ? build_real_from_wide (stept,
9764                                                            mul_elt, UNSIGNED)
9765                                    : build_int_cstu (stept, mul_elt));
9766             }
9767           vec_step = gimple_build_vector (&init_stmts, &step_elts);
9768           vec_steps.safe_push (vec_step);
9769           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9770           if (peel_mul)
9771             step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9772                                      step_mul, peel_mul);
9773           if (!init_node)
9774             vec_init = gimple_build_vector (&init_stmts, &init_elts);
9775
9776           /* Create the induction-phi that defines the induction-operand.  */
9777           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9778                                             "vec_iv_");
9779           induction_phi = create_phi_node (vec_dest, iv_loop->header);
9780           induc_def = PHI_RESULT (induction_phi);
9781
9782           /* Create the iv update inside the loop  */
9783           tree up = vec_step;
9784           if (lupdate_mul)
9785             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9786                                vec_step, lupdate_mul);
9787           gimple_seq stmts = NULL;
9788           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9789           vec_def = gimple_build (&stmts,
9790                                   PLUS_EXPR, step_vectype, vec_def, up);
9791           vec_def = gimple_convert (&stmts, vectype, vec_def);
9792           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9793           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9794                        UNKNOWN_LOCATION);
9795
9796           if (init_node)
9797             vec_init = vect_get_slp_vect_def (init_node, ivn);
9798           if (!nested_in_vect_loop
9799               && !integer_zerop (step_mul))
9800             {
9801               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9802               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9803                                  vec_step, step_mul);
9804               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9805                                       vec_def, up);
9806               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9807             }
9808
9809           /* Set the arguments of the phi node:  */
9810           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9811
9812           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9813         }
9814       if (!nested_in_vect_loop)
9815         {
9816           /* Fill up to the number of vectors we need for the whole group.  */
9817           nivs = least_common_multiple (group_size,
9818                                         const_nunits) / const_nunits;
9819           vec_steps.reserve (nivs-ivn);
9820           for (; ivn < nivs; ++ivn)
9821             {
9822               SLP_TREE_VEC_STMTS (slp_node)
9823                 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9824               vec_steps.quick_push (vec_steps[0]);
9825             }
9826         }
9827
9828       /* Re-use IVs when we can.  We are generating further vector
9829          stmts by adding VF' * stride to the IVs generated above.  */
9830       if (ivn < nvects)
9831         {
9832           unsigned vfp
9833             = least_common_multiple (group_size, const_nunits) / group_size;
9834           tree lupdate_mul
9835             = build_vector_from_val (step_vectype,
9836                                      SCALAR_FLOAT_TYPE_P (stept)
9837                                      ? build_real_from_wide (stept,
9838                                                              vfp, UNSIGNED)
9839                                      : build_int_cstu (stept, vfp));
9840           for (; ivn < nvects; ++ivn)
9841             {
9842               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9843               tree def = gimple_get_lhs (iv);
9844               if (ivn < 2*nivs)
9845                 vec_steps[ivn - nivs]
9846                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9847                                   vec_steps[ivn - nivs], lupdate_mul);
9848               gimple_seq stmts = NULL;
9849               def = gimple_convert (&stmts, step_vectype, def);
9850               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9851                                   def, vec_steps[ivn % nivs]);
9852               def = gimple_convert (&stmts, vectype, def);
9853               if (gimple_code (iv) == GIMPLE_PHI)
9854                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9855               else
9856                 {
9857                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9858                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9859                 }
9860               SLP_TREE_VEC_STMTS (slp_node)
9861                 .quick_push (SSA_NAME_DEF_STMT (def));
9862             }
9863         }
9864
9865       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9866       gcc_assert (!new_bb);
9867
9868       return true;
9869     }
9870
9871   init_expr = vect_phi_initial_value (phi);
9872
9873   gimple_seq stmts = NULL;
9874   if (!nested_in_vect_loop)
9875     {
9876       /* Convert the initial value to the IV update type.  */
9877       tree new_type = TREE_TYPE (step_expr);
9878       init_expr = gimple_convert (&stmts, new_type, init_expr);
9879
9880       /* If we are using the loop mask to "peel" for alignment then we need
9881          to adjust the start value here.  */
9882       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9883       if (skip_niters != NULL_TREE)
9884         {
9885           if (FLOAT_TYPE_P (vectype))
9886             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9887                                         skip_niters);
9888           else
9889             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9890           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9891                                          skip_niters, step_expr);
9892           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9893                                     init_expr, skip_step);
9894         }
9895     }
9896
9897   if (stmts)
9898     {
9899       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9900       gcc_assert (!new_bb);
9901     }
9902
9903   /* Create the vector that holds the initial_value of the induction.  */
9904   if (nested_in_vect_loop)
9905     {
9906       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
9907          been created during vectorization of previous stmts.  We obtain it
9908          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
9909       auto_vec<tree> vec_inits;
9910       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9911                                      init_expr, &vec_inits);
9912       vec_init = vec_inits[0];
9913       /* If the initial value is not of proper type, convert it.  */
9914       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9915         {
9916           new_stmt
9917             = gimple_build_assign (vect_get_new_ssa_name (vectype,
9918                                                           vect_simple_var,
9919                                                           "vec_iv_"),
9920                                    VIEW_CONVERT_EXPR,
9921                                    build1 (VIEW_CONVERT_EXPR, vectype,
9922                                            vec_init));
9923           vec_init = gimple_assign_lhs (new_stmt);
9924           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9925                                                  new_stmt);
9926           gcc_assert (!new_bb);
9927         }
9928     }
9929   else
9930     {
9931       /* iv_loop is the loop to be vectorized. Create:
9932          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
9933       stmts = NULL;
9934       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9935
9936       unsigned HOST_WIDE_INT const_nunits;
9937       if (nunits.is_constant (&const_nunits))
9938         {
9939           tree_vector_builder elts (step_vectype, const_nunits, 1);
9940           elts.quick_push (new_name);
9941           for (i = 1; i < const_nunits; i++)
9942             {
9943               /* Create: new_name_i = new_name + step_expr  */
9944               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9945                                        new_name, step_expr);
9946               elts.quick_push (new_name);
9947             }
9948           /* Create a vector from [new_name_0, new_name_1, ...,
9949              new_name_nunits-1]  */
9950           vec_init = gimple_build_vector (&stmts, &elts);
9951         }
9952       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9953         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
9954         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9955                                  new_name, step_expr);
9956       else
9957         {
9958           /* Build:
9959                 [base, base, base, ...]
9960                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
9961           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9962           gcc_assert (flag_associative_math);
9963           tree index = build_index_vector (step_vectype, 0, 1);
9964           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9965                                                         new_name);
9966           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9967                                                         step_expr);
9968           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9969           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9970                                    vec_init, step_vec);
9971           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9972                                    vec_init, base_vec);
9973         }
9974       vec_init = gimple_convert (&stmts, vectype, vec_init);
9975
9976       if (stmts)
9977         {
9978           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9979           gcc_assert (!new_bb);
9980         }
9981     }
9982
9983
9984   /* Create the vector that holds the step of the induction.  */
9985   if (nested_in_vect_loop)
9986     /* iv_loop is nested in the loop to be vectorized. Generate:
9987        vec_step = [S, S, S, S]  */
9988     new_name = step_expr;
9989   else
9990     {
9991       /* iv_loop is the loop to be vectorized. Generate:
9992           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
9993       gimple_seq seq = NULL;
9994       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9995         {
9996           expr = build_int_cst (integer_type_node, vf);
9997           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9998         }
9999       else
10000         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10001       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10002                                expr, step_expr);
10003       if (seq)
10004         {
10005           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10006           gcc_assert (!new_bb);
10007         }
10008     }
10009
10010   t = unshare_expr (new_name);
10011   gcc_assert (CONSTANT_CLASS_P (new_name)
10012               || TREE_CODE (new_name) == SSA_NAME);
10013   new_vec = build_vector_from_val (step_vectype, t);
10014   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10015                                new_vec, step_vectype, NULL);
10016
10017
10018   /* Create the following def-use cycle:
10019      loop prolog:
10020          vec_init = ...
10021          vec_step = ...
10022      loop:
10023          vec_iv = PHI <vec_init, vec_loop>
10024          ...
10025          STMT
10026          ...
10027          vec_loop = vec_iv + vec_step;  */
10028
10029   /* Create the induction-phi that defines the induction-operand.  */
10030   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10031   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10032   induc_def = PHI_RESULT (induction_phi);
10033
10034   /* Create the iv update inside the loop  */
10035   stmts = NULL;
10036   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10037   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10038   vec_def = gimple_convert (&stmts, vectype, vec_def);
10039   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10040   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10041
10042   /* Set the arguments of the phi node:  */
10043   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10044   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10045                UNKNOWN_LOCATION);
10046
10047   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10048   *vec_stmt = induction_phi;
10049
10050   /* In case that vectorization factor (VF) is bigger than the number
10051      of elements that we can fit in a vectype (nunits), we have to generate
10052      more than one vector stmt - i.e - we need to "unroll" the
10053      vector stmt by a factor VF/nunits.  For more details see documentation
10054      in vectorizable_operation.  */
10055
10056   if (ncopies > 1)
10057     {
10058       gimple_seq seq = NULL;
10059       /* FORNOW. This restriction should be relaxed.  */
10060       gcc_assert (!nested_in_vect_loop);
10061
10062       /* Create the vector that holds the step of the induction.  */
10063       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10064         {
10065           expr = build_int_cst (integer_type_node, nunits);
10066           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10067         }
10068       else
10069         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10070       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10071                                expr, step_expr);
10072       if (seq)
10073         {
10074           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10075           gcc_assert (!new_bb);
10076         }
10077
10078       t = unshare_expr (new_name);
10079       gcc_assert (CONSTANT_CLASS_P (new_name)
10080                   || TREE_CODE (new_name) == SSA_NAME);
10081       new_vec = build_vector_from_val (step_vectype, t);
10082       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10083                                    new_vec, step_vectype, NULL);
10084
10085       vec_def = induc_def;
10086       for (i = 1; i < ncopies; i++)
10087         {
10088           /* vec_i = vec_prev + vec_step  */
10089           gimple_seq stmts = NULL;
10090           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10091           vec_def = gimple_build (&stmts,
10092                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10093           vec_def = gimple_convert (&stmts, vectype, vec_def);
10094
10095           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10096           new_stmt = SSA_NAME_DEF_STMT (vec_def);
10097           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10098         }
10099     }
10100
10101   if (dump_enabled_p ())
10102     dump_printf_loc (MSG_NOTE, vect_location,
10103                      "transform induction: created def-use cycle: %G%G",
10104                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10105
10106   return true;
10107 }
10108
10109 /* Function vectorizable_live_operation.
10110
10111    STMT_INFO computes a value that is used outside the loop.  Check if
10112    it can be supported.  */
10113
10114 bool
10115 vectorizable_live_operation (vec_info *vinfo,
10116                              stmt_vec_info stmt_info,
10117                              gimple_stmt_iterator *gsi,
10118                              slp_tree slp_node, slp_instance slp_node_instance,
10119                              int slp_index, bool vec_stmt_p,
10120                              stmt_vector_for_cost *cost_vec)
10121 {
10122   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10123   imm_use_iterator imm_iter;
10124   tree lhs, lhs_type, bitsize;
10125   tree vectype = (slp_node
10126                   ? SLP_TREE_VECTYPE (slp_node)
10127                   : STMT_VINFO_VECTYPE (stmt_info));
10128   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10129   int ncopies;
10130   gimple *use_stmt;
10131   auto_vec<tree> vec_oprnds;
10132   int vec_entry = 0;
10133   poly_uint64 vec_index = 0;
10134
10135   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10136
10137   /* If a stmt of a reduction is live, vectorize it via
10138      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
10139      validity so just trigger the transform here.  */
10140   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10141     {
10142       if (!vec_stmt_p)
10143         return true;
10144       if (slp_node)
10145         {
10146           /* For reduction chains the meta-info is attached to
10147              the group leader.  */
10148           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10149             stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10150           /* For SLP reductions we vectorize the epilogue for
10151              all involved stmts together.  */
10152           else if (slp_index != 0)
10153             return true;
10154         }
10155       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10156       gcc_assert (reduc_info->is_reduc_info);
10157       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10158           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10159         return true;
10160       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10161                                         slp_node_instance);
10162       return true;
10163     }
10164
10165   /* If STMT is not relevant and it is a simple assignment and its inputs are
10166      invariant then it can remain in place, unvectorized.  The original last
10167      scalar value that it computes will be used.  */
10168   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10169     {
10170       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10171       if (dump_enabled_p ())
10172         dump_printf_loc (MSG_NOTE, vect_location,
10173                          "statement is simple and uses invariant.  Leaving in "
10174                          "place.\n");
10175       return true;
10176     }
10177
10178   if (slp_node)
10179     ncopies = 1;
10180   else
10181     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10182
10183   if (slp_node)
10184     {
10185       gcc_assert (slp_index >= 0);
10186
10187       /* Get the last occurrence of the scalar index from the concatenation of
10188          all the slp vectors. Calculate which slp vector it is and the index
10189          within.  */
10190       int num_scalar = SLP_TREE_LANES (slp_node);
10191       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10192       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10193
10194       /* Calculate which vector contains the result, and which lane of
10195          that vector we need.  */
10196       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10197         {
10198           if (dump_enabled_p ())
10199             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10200                              "Cannot determine which vector holds the"
10201                              " final result.\n");
10202           return false;
10203         }
10204     }
10205
10206   if (!vec_stmt_p)
10207     {
10208       /* No transformation required.  */
10209       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10210         {
10211           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10212                                                OPTIMIZE_FOR_SPEED))
10213             {
10214               if (dump_enabled_p ())
10215                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10216                                  "can't operate on partial vectors "
10217                                  "because the target doesn't support extract "
10218                                  "last reduction.\n");
10219               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10220             }
10221           else if (slp_node)
10222             {
10223               if (dump_enabled_p ())
10224                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10225                                  "can't operate on partial vectors "
10226                                  "because an SLP statement is live after "
10227                                  "the loop.\n");
10228               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10229             }
10230           else if (ncopies > 1)
10231             {
10232               if (dump_enabled_p ())
10233                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10234                                  "can't operate on partial vectors "
10235                                  "because ncopies is greater than 1.\n");
10236               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10237             }
10238           else
10239             {
10240               gcc_assert (ncopies == 1 && !slp_node);
10241               vect_record_loop_mask (loop_vinfo,
10242                                      &LOOP_VINFO_MASKS (loop_vinfo),
10243                                      1, vectype, NULL);
10244             }
10245         }
10246       /* ???  Enable for loop costing as well.  */
10247       if (!loop_vinfo)
10248         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10249                           0, vect_epilogue);
10250       return true;
10251     }
10252
10253   /* Use the lhs of the original scalar statement.  */
10254   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10255   if (dump_enabled_p ())
10256     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10257                      "stmt %G", stmt);
10258
10259   lhs = gimple_get_lhs (stmt);
10260   lhs_type = TREE_TYPE (lhs);
10261
10262   bitsize = vector_element_bits_tree (vectype);
10263
10264   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
10265   tree vec_lhs, bitstart;
10266   gimple *vec_stmt;
10267   if (slp_node)
10268     {
10269       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10270
10271       /* Get the correct slp vectorized stmt.  */
10272       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
10273       vec_lhs = gimple_get_lhs (vec_stmt);
10274
10275       /* Get entry to use.  */
10276       bitstart = bitsize_int (vec_index);
10277       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10278     }
10279   else
10280     {
10281       /* For multiple copies, get the last copy.  */
10282       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10283       vec_lhs = gimple_get_lhs (vec_stmt);
10284
10285       /* Get the last lane in the vector.  */
10286       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10287     }
10288
10289   if (loop_vinfo)
10290     {
10291       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10292          requirement, insert one phi node for it.  It looks like:
10293            loop;
10294          BB:
10295            # lhs' = PHI <lhs>
10296          ==>
10297            loop;
10298          BB:
10299            # vec_lhs' = PHI <vec_lhs>
10300            new_tree = lane_extract <vec_lhs', ...>;
10301            lhs' = new_tree;  */
10302
10303       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10304       basic_block exit_bb = single_exit (loop)->dest;
10305       gcc_assert (single_pred_p (exit_bb));
10306
10307       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10308       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10309       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10310
10311       gimple_seq stmts = NULL;
10312       tree new_tree;
10313       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10314         {
10315           /* Emit:
10316
10317                SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10318
10319              where VEC_LHS is the vectorized live-out result and MASK is
10320              the loop mask for the final iteration.  */
10321           gcc_assert (ncopies == 1 && !slp_node);
10322           tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10323           tree mask = vect_get_loop_mask (loop_vinfo, gsi,
10324                                           &LOOP_VINFO_MASKS (loop_vinfo),
10325                                           1, vectype, 0);
10326           tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10327                                           mask, vec_lhs_phi);
10328
10329           /* Convert the extracted vector element to the scalar type.  */
10330           new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10331         }
10332       else
10333         {
10334           tree bftype = TREE_TYPE (vectype);
10335           if (VECTOR_BOOLEAN_TYPE_P (vectype))
10336             bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10337           new_tree = build3 (BIT_FIELD_REF, bftype,
10338                              vec_lhs_phi, bitsize, bitstart);
10339           new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10340                                            &stmts, true, NULL_TREE);
10341         }
10342
10343       if (stmts)
10344         {
10345           gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10346           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10347
10348           /* Remove existing phi from lhs and create one copy from new_tree.  */
10349           tree lhs_phi = NULL_TREE;
10350           gimple_stmt_iterator gsi;
10351           for (gsi = gsi_start_phis (exit_bb);
10352                !gsi_end_p (gsi); gsi_next (&gsi))
10353             {
10354               gimple *phi = gsi_stmt (gsi);
10355               if ((gimple_phi_arg_def (phi, 0) == lhs))
10356                 {
10357                   remove_phi_node (&gsi, false);
10358                   lhs_phi = gimple_phi_result (phi);
10359                   gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10360                   gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10361                   break;
10362                 }
10363             }
10364         }
10365
10366       /* Replace use of lhs with newly computed result.  If the use stmt is a
10367          single arg PHI, just replace all uses of PHI result.  It's necessary
10368          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10369       use_operand_p use_p;
10370       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10371         if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10372             && !is_gimple_debug (use_stmt))
10373           {
10374             if (gimple_code (use_stmt) == GIMPLE_PHI
10375                 && gimple_phi_num_args (use_stmt) == 1)
10376               {
10377                 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10378               }
10379             else
10380               {
10381                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10382                     SET_USE (use_p, new_tree);
10383               }
10384             update_stmt (use_stmt);
10385           }
10386     }
10387   else
10388     {
10389       /* For basic-block vectorization simply insert the lane-extraction.  */
10390       tree bftype = TREE_TYPE (vectype);
10391       if (VECTOR_BOOLEAN_TYPE_P (vectype))
10392         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10393       tree new_tree = build3 (BIT_FIELD_REF, bftype,
10394                               vec_lhs, bitsize, bitstart);
10395       gimple_seq stmts = NULL;
10396       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10397                                        &stmts, true, NULL_TREE);
10398       if (TREE_CODE (new_tree) == SSA_NAME
10399           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10400         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10401       if (is_a <gphi *> (vec_stmt))
10402         {
10403           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10404           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10405         }
10406       else
10407         {
10408           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10409           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10410         }
10411
10412       /* Replace use of lhs with newly computed result.  If the use stmt is a
10413          single arg PHI, just replace all uses of PHI result.  It's necessary
10414          because lcssa PHI defining lhs may be before newly inserted stmt.  */
10415       use_operand_p use_p;
10416       stmt_vec_info use_stmt_info;
10417       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10418         if (!is_gimple_debug (use_stmt)
10419             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10420                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10421           {
10422             /* ???  This can happen when the live lane ends up being
10423                used in a vector construction code-generated by an
10424                external SLP node (and code-generation for that already
10425                happened).  See gcc.dg/vect/bb-slp-47.c.
10426                Doing this is what would happen if that vector CTOR
10427                were not code-generated yet so it is not too bad.
10428                ???  In fact we'd likely want to avoid this situation
10429                in the first place.  */
10430             if (TREE_CODE (new_tree) == SSA_NAME
10431                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10432                 && gimple_code (use_stmt) != GIMPLE_PHI
10433                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10434                                                 use_stmt))
10435               {
10436                 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10437                 gcc_checking_assert (code == SSA_NAME
10438                                      || code == CONSTRUCTOR
10439                                      || code == VIEW_CONVERT_EXPR
10440                                      || CONVERT_EXPR_CODE_P (code));
10441                 if (dump_enabled_p ())
10442                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10443                                    "Using original scalar computation for "
10444                                    "live lane because use preceeds vector "
10445                                    "def\n");
10446                 continue;
10447               }
10448             /* ???  It can also happen that we end up pulling a def into
10449                a loop where replacing out-of-loop uses would require
10450                a new LC SSA PHI node.  Retain the original scalar in
10451                those cases as well.  PR98064.  */
10452             if (TREE_CODE (new_tree) == SSA_NAME
10453                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10454                 && (gimple_bb (use_stmt)->loop_father
10455                     != gimple_bb (vec_stmt)->loop_father)
10456                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10457                                         gimple_bb (use_stmt)->loop_father))
10458               {
10459                 if (dump_enabled_p ())
10460                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10461                                    "Using original scalar computation for "
10462                                    "live lane because there is an out-of-loop "
10463                                    "definition for it\n");
10464                 continue;
10465               }
10466             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10467               SET_USE (use_p, new_tree);
10468             update_stmt (use_stmt);
10469           }
10470     }
10471
10472   return true;
10473 }
10474
10475 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
10476
10477 static void
10478 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10479 {
10480   ssa_op_iter op_iter;
10481   imm_use_iterator imm_iter;
10482   def_operand_p def_p;
10483   gimple *ustmt;
10484
10485   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10486     {
10487       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10488         {
10489           basic_block bb;
10490
10491           if (!is_gimple_debug (ustmt))
10492             continue;
10493
10494           bb = gimple_bb (ustmt);
10495
10496           if (!flow_bb_inside_loop_p (loop, bb))
10497             {
10498               if (gimple_debug_bind_p (ustmt))
10499                 {
10500                   if (dump_enabled_p ())
10501                     dump_printf_loc (MSG_NOTE, vect_location,
10502                                      "killing debug use\n");
10503
10504                   gimple_debug_bind_reset_value (ustmt);
10505                   update_stmt (ustmt);
10506                 }
10507               else
10508                 gcc_unreachable ();
10509             }
10510         }
10511     }
10512 }
10513
10514 /* Given loop represented by LOOP_VINFO, return true if computation of
10515    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10516    otherwise.  */
10517
10518 static bool
10519 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10520 {
10521   /* Constant case.  */
10522   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10523     {
10524       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10525       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10526
10527       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10528       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10529       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10530         return true;
10531     }
10532
10533   widest_int max;
10534   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10535   /* Check the upper bound of loop niters.  */
10536   if (get_max_loop_iterations (loop, &max))
10537     {
10538       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10539       signop sgn = TYPE_SIGN (type);
10540       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10541       if (max < type_max)
10542         return true;
10543     }
10544   return false;
10545 }
10546
10547 /* Return a mask type with half the number of elements as OLD_TYPE,
10548    given that it should have mode NEW_MODE.  */
10549
10550 tree
10551 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10552 {
10553   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10554   return build_truth_vector_type_for_mode (nunits, new_mode);
10555 }
10556
10557 /* Return a mask type with twice as many elements as OLD_TYPE,
10558    given that it should have mode NEW_MODE.  */
10559
10560 tree
10561 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10562 {
10563   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10564   return build_truth_vector_type_for_mode (nunits, new_mode);
10565 }
10566
10567 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10568    contain a sequence of NVECTORS masks that each control a vector of type
10569    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
10570    these vector masks with the vector version of SCALAR_MASK.  */
10571
10572 void
10573 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10574                        unsigned int nvectors, tree vectype, tree scalar_mask)
10575 {
10576   gcc_assert (nvectors != 0);
10577
10578   if (scalar_mask)
10579     {
10580       scalar_cond_masked_key cond (scalar_mask, nvectors);
10581       loop_vinfo->scalar_cond_masked_set.add (cond);
10582     }
10583
10584   masks->mask_set.add (std::make_pair (vectype, nvectors));
10585 }
10586
10587 /* Given a complete set of masks MASKS, extract mask number INDEX
10588    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10589    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
10590
10591    See the comment above vec_loop_masks for more details about the mask
10592    arrangement.  */
10593
10594 tree
10595 vect_get_loop_mask (loop_vec_info loop_vinfo,
10596                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10597                     unsigned int nvectors, tree vectype, unsigned int index)
10598 {
10599   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10600       == vect_partial_vectors_while_ult)
10601     {
10602       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10603       tree mask_type = rgm->type;
10604
10605       /* Populate the rgroup's mask array, if this is the first time we've
10606          used it.  */
10607       if (rgm->controls.is_empty ())
10608         {
10609           rgm->controls.safe_grow_cleared (nvectors, true);
10610           for (unsigned int i = 0; i < nvectors; ++i)
10611             {
10612               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10613               /* Provide a dummy definition until the real one is available.  */
10614               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10615               rgm->controls[i] = mask;
10616             }
10617         }
10618
10619       tree mask = rgm->controls[index];
10620       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10621                     TYPE_VECTOR_SUBPARTS (vectype)))
10622         {
10623           /* A loop mask for data type X can be reused for data type Y
10624              if X has N times more elements than Y and if Y's elements
10625              are N times bigger than X's.  In this case each sequence
10626              of N elements in the loop mask will be all-zero or all-one.
10627              We can then view-convert the mask so that each sequence of
10628              N elements is replaced by a single element.  */
10629           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10630                                   TYPE_VECTOR_SUBPARTS (vectype)));
10631           gimple_seq seq = NULL;
10632           mask_type = truth_type_for (vectype);
10633           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10634           if (seq)
10635             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10636         }
10637       return mask;
10638     }
10639   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10640            == vect_partial_vectors_avx512)
10641     {
10642       /* The number of scalars per iteration and the number of vectors are
10643          both compile-time constants.  */
10644       unsigned int nscalars_per_iter
10645         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10646                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10647
10648       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10649
10650       /* The stored nV is dependent on the mask type produced.  */
10651       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10652                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10653                   == rgm->factor);
10654       nvectors = rgm->factor;
10655
10656       /* Populate the rgroup's mask array, if this is the first time we've
10657          used it.  */
10658       if (rgm->controls.is_empty ())
10659         {
10660           rgm->controls.safe_grow_cleared (nvectors, true);
10661           for (unsigned int i = 0; i < nvectors; ++i)
10662             {
10663               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10664               /* Provide a dummy definition until the real one is available.  */
10665               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10666               rgm->controls[i] = mask;
10667             }
10668         }
10669       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10670                     TYPE_VECTOR_SUBPARTS (vectype)))
10671         return rgm->controls[index];
10672
10673       /* Split the vector if needed.  Since we are dealing with integer mode
10674          masks with AVX512 we can operate on the integer representation
10675          performing the whole vector shifting.  */
10676       unsigned HOST_WIDE_INT factor;
10677       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10678                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
10679       gcc_assert (ok);
10680       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10681       tree mask_type = truth_type_for (vectype);
10682       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10683       unsigned vi = index / factor;
10684       unsigned vpart = index % factor;
10685       tree vec = rgm->controls[vi];
10686       gimple_seq seq = NULL;
10687       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10688                           lang_hooks.types.type_for_mode
10689                                 (TYPE_MODE (rgm->type), 1), vec);
10690       /* For integer mode masks simply shift the right bits into position.  */
10691       if (vpart != 0)
10692         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10693                             build_int_cst (integer_type_node,
10694                                            (TYPE_VECTOR_SUBPARTS (vectype)
10695                                             * vpart)));
10696       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10697                                     (TYPE_MODE (mask_type), 1), vec);
10698       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10699       if (seq)
10700         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10701       return vec;
10702     }
10703   else
10704     gcc_unreachable ();
10705 }
10706
10707 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10708    lengths for controlling an operation on VECTYPE.  The operation splits
10709    each element of VECTYPE into FACTOR separate subelements, measuring the
10710    length as a number of these subelements.  */
10711
10712 void
10713 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10714                       unsigned int nvectors, tree vectype, unsigned int factor)
10715 {
10716   gcc_assert (nvectors != 0);
10717   if (lens->length () < nvectors)
10718     lens->safe_grow_cleared (nvectors, true);
10719   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10720
10721   /* The number of scalars per iteration, scalar occupied bytes and
10722      the number of vectors are both compile-time constants.  */
10723   unsigned int nscalars_per_iter
10724     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10725                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10726
10727   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10728     {
10729       /* For now, we only support cases in which all loads and stores fall back
10730          to VnQI or none do.  */
10731       gcc_assert (!rgl->max_nscalars_per_iter
10732                   || (rgl->factor == 1 && factor == 1)
10733                   || (rgl->max_nscalars_per_iter * rgl->factor
10734                       == nscalars_per_iter * factor));
10735       rgl->max_nscalars_per_iter = nscalars_per_iter;
10736       rgl->type = vectype;
10737       rgl->factor = factor;
10738     }
10739 }
10740
10741 /* Given a complete set of lengths LENS, extract length number INDEX
10742    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10743    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
10744    multipled by the number of elements that should be processed.
10745    Insert any set-up statements before GSI.  */
10746
10747 tree
10748 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10749                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10750                    unsigned int index, unsigned int factor)
10751 {
10752   rgroup_controls *rgl = &(*lens)[nvectors - 1];
10753   bool use_bias_adjusted_len =
10754     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10755
10756   /* Populate the rgroup's len array, if this is the first time we've
10757      used it.  */
10758   if (rgl->controls.is_empty ())
10759     {
10760       rgl->controls.safe_grow_cleared (nvectors, true);
10761       for (unsigned int i = 0; i < nvectors; ++i)
10762         {
10763           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10764           gcc_assert (len_type != NULL_TREE);
10765
10766           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10767
10768           /* Provide a dummy definition until the real one is available.  */
10769           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10770           rgl->controls[i] = len;
10771
10772           if (use_bias_adjusted_len)
10773             {
10774               gcc_assert (i == 0);
10775               tree adjusted_len =
10776                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10777               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10778               rgl->bias_adjusted_ctrl = adjusted_len;
10779             }
10780         }
10781     }
10782
10783   if (use_bias_adjusted_len)
10784     return rgl->bias_adjusted_ctrl;
10785
10786   tree loop_len = rgl->controls[index];
10787   if (rgl->factor == 1 && factor == 1)
10788     {
10789       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10790       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10791       if (maybe_ne (nunits1, nunits2))
10792         {
10793           /* A loop len for data type X can be reused for data type Y
10794              if X has N times more elements than Y and if Y's elements
10795              are N times bigger than X's.  */
10796           gcc_assert (multiple_p (nunits1, nunits2));
10797           factor = exact_div (nunits1, nunits2).to_constant ();
10798           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10799           gimple_seq seq = NULL;
10800           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10801                                    build_int_cst (iv_type, factor));
10802           if (seq)
10803             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10804         }
10805     }
10806   return loop_len;
10807 }
10808
10809 /* Scale profiling counters by estimation for LOOP which is vectorized
10810    by factor VF.  */
10811
10812 static void
10813 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10814 {
10815   edge preheader = loop_preheader_edge (loop);
10816   /* Reduce loop iterations by the vectorization factor.  */
10817   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10818   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10819
10820   if (freq_h.nonzero_p ())
10821     {
10822       profile_probability p;
10823
10824       /* Avoid dropping loop body profile counter to 0 because of zero count
10825          in loop's preheader.  */
10826       if (!(freq_e == profile_count::zero ()))
10827         freq_e = freq_e.force_nonzero ();
10828       p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10829       scale_loop_frequencies (loop, p);
10830     }
10831
10832   edge exit_e = single_exit (loop);
10833   exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10834
10835   edge exit_l = single_pred_edge (loop->latch);
10836   profile_probability prob = exit_l->probability;
10837   exit_l->probability = exit_e->probability.invert ();
10838   if (prob.initialized_p () && exit_l->probability.initialized_p ())
10839     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10840 }
10841
10842 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10843    latch edge values originally defined by it.  */
10844
10845 static void
10846 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10847                                      stmt_vec_info def_stmt_info)
10848 {
10849   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10850   if (!def || TREE_CODE (def) != SSA_NAME)
10851     return;
10852   stmt_vec_info phi_info;
10853   imm_use_iterator iter;
10854   use_operand_p use_p;
10855   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10856     {
10857       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10858       if (!phi)
10859         continue;
10860       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10861             && (phi_info = loop_vinfo->lookup_stmt (phi))
10862             && STMT_VINFO_RELEVANT_P (phi_info)))
10863         continue;
10864       loop_p loop = gimple_bb (phi)->loop_father;
10865       edge e = loop_latch_edge (loop);
10866       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10867         continue;
10868
10869       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10870           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10871           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10872         {
10873           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10874           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10875           gcc_assert (phi_defs.length () == latch_defs.length ());
10876           for (unsigned i = 0; i < phi_defs.length (); ++i)
10877             add_phi_arg (as_a <gphi *> (phi_defs[i]),
10878                          gimple_get_lhs (latch_defs[i]), e,
10879                          gimple_phi_arg_location (phi, e->dest_idx));
10880         }
10881       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10882         {
10883           /* For first order recurrences we have to update both uses of
10884              the latch definition, the one in the PHI node and the one
10885              in the generated VEC_PERM_EXPR.  */
10886           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10887           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10888           gcc_assert (phi_defs.length () == latch_defs.length ());
10889           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10890           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10891           for (unsigned i = 0; i < phi_defs.length (); ++i)
10892             {
10893               gassign *perm = as_a <gassign *> (phi_defs[i]);
10894               if (i > 0)
10895                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10896               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10897               update_stmt (perm);
10898             }
10899           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10900                        gimple_phi_arg_location (phi, e->dest_idx));
10901         }
10902     }
10903 }
10904
10905 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10906    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10907    stmt_vec_info.  */
10908
10909 static bool
10910 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10911                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10912 {
10913   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10914   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10915
10916   if (dump_enabled_p ())
10917     dump_printf_loc (MSG_NOTE, vect_location,
10918                      "------>vectorizing statement: %G", stmt_info->stmt);
10919
10920   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10921     vect_loop_kill_debug_uses (loop, stmt_info);
10922
10923   if (!STMT_VINFO_RELEVANT_P (stmt_info)
10924       && !STMT_VINFO_LIVE_P (stmt_info))
10925     return false;
10926
10927   if (STMT_VINFO_VECTYPE (stmt_info))
10928     {
10929       poly_uint64 nunits
10930         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10931       if (!STMT_SLP_TYPE (stmt_info)
10932           && maybe_ne (nunits, vf)
10933           && dump_enabled_p ())
10934         /* For SLP VF is set according to unrolling factor, and not
10935            to vector size, hence for SLP this print is not valid.  */
10936         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10937     }
10938
10939   /* Pure SLP statements have already been vectorized.  We still need
10940      to apply loop vectorization to hybrid SLP statements.  */
10941   if (PURE_SLP_STMT (stmt_info))
10942     return false;
10943
10944   if (dump_enabled_p ())
10945     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10946
10947   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10948     *seen_store = stmt_info;
10949
10950   return true;
10951 }
10952
10953 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10954    in the hash_map with its corresponding values.  */
10955
10956 static tree
10957 find_in_mapping (tree t, void *context)
10958 {
10959   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10960
10961   tree *value = mapping->get (t);
10962   return value ? *value : t;
10963 }
10964
10965 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
10966    original loop that has now been vectorized.
10967
10968    The inits of the data_references need to be advanced with the number of
10969    iterations of the main loop.  This has been computed in vect_do_peeling and
10970    is stored in parameter ADVANCE.  We first restore the data_references
10971    initial offset with the values recored in ORIG_DRS_INIT.
10972
10973    Since the loop_vec_info of this EPILOGUE was constructed for the original
10974    loop, its stmt_vec_infos all point to the original statements.  These need
10975    to be updated to point to their corresponding copies as well as the SSA_NAMES
10976    in their PATTERN_DEF_SEQs and RELATED_STMTs.
10977
10978    The data_reference's connections also need to be updated.  Their
10979    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10980    stmt_vec_infos, their statements need to point to their corresponding copy,
10981    if they are gather loads or scatter stores then their reference needs to be
10982    updated to point to its corresponding copy and finally we set
10983    'base_misaligned' to false as we have already peeled for alignment in the
10984    prologue of the main loop.  */
10985
10986 static void
10987 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10988 {
10989   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10990   auto_vec<gimple *> stmt_worklist;
10991   hash_map<tree,tree> mapping;
10992   gimple *orig_stmt, *new_stmt;
10993   gimple_stmt_iterator epilogue_gsi;
10994   gphi_iterator epilogue_phi_gsi;
10995   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10996   basic_block *epilogue_bbs = get_loop_body (epilogue);
10997   unsigned i;
10998
10999   free (LOOP_VINFO_BBS (epilogue_vinfo));
11000   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11001
11002   /* Advance data_reference's with the number of iterations of the previous
11003      loop and its prologue.  */
11004   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11005
11006
11007   /* The EPILOGUE loop is a copy of the original loop so they share the same
11008      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11009      point to the copied statements.  We also create a mapping of all LHS' in
11010      the original loop and all the LHS' in the EPILOGUE and create worklists to
11011      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11012   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11013     {
11014       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11015            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11016         {
11017           new_stmt = epilogue_phi_gsi.phi ();
11018
11019           gcc_assert (gimple_uid (new_stmt) > 0);
11020           stmt_vinfo
11021             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11022
11023           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11024           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11025
11026           mapping.put (gimple_phi_result (orig_stmt),
11027                        gimple_phi_result (new_stmt));
11028           /* PHI nodes can not have patterns or related statements.  */
11029           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11030                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11031         }
11032
11033       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11034            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11035         {
11036           new_stmt = gsi_stmt (epilogue_gsi);
11037           if (is_gimple_debug (new_stmt))
11038             continue;
11039
11040           gcc_assert (gimple_uid (new_stmt) > 0);
11041           stmt_vinfo
11042             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11043
11044           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11045           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11046
11047           if (tree old_lhs = gimple_get_lhs (orig_stmt))
11048             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11049
11050           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11051             {
11052               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11053               for (gimple_stmt_iterator gsi = gsi_start (seq);
11054                    !gsi_end_p (gsi); gsi_next (&gsi))
11055                 stmt_worklist.safe_push (gsi_stmt (gsi));
11056             }
11057
11058           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11059           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11060             {
11061               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11062               stmt_worklist.safe_push (stmt);
11063               /* Set BB such that the assert in
11064                 'get_initial_def_for_reduction' is able to determine that
11065                 the BB of the related stmt is inside this loop.  */
11066               gimple_set_bb (stmt,
11067                              gimple_bb (new_stmt));
11068               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11069               gcc_assert (related_vinfo == NULL
11070                           || related_vinfo == stmt_vinfo);
11071             }
11072         }
11073     }
11074
11075   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11076      using the original main loop and thus need to be updated to refer to the
11077      cloned variables used in the epilogue.  */
11078   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11079     {
11080       gimple *stmt = stmt_worklist[i];
11081       tree *new_op;
11082
11083       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11084         {
11085           tree op = gimple_op (stmt, j);
11086           if ((new_op = mapping.get(op)))
11087             gimple_set_op (stmt, j, *new_op);
11088           else
11089             {
11090               /* PR92429: The last argument of simplify_replace_tree disables
11091                  folding when replacing arguments.  This is required as
11092                  otherwise you might end up with different statements than the
11093                  ones analyzed in vect_loop_analyze, leading to different
11094                  vectorization.  */
11095               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11096                                           &find_in_mapping, &mapping, false);
11097               gimple_set_op (stmt, j, op);
11098             }
11099         }
11100     }
11101
11102   struct data_reference *dr;
11103   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11104   FOR_EACH_VEC_ELT (datarefs, i, dr)
11105     {
11106       orig_stmt = DR_STMT (dr);
11107       gcc_assert (gimple_uid (orig_stmt) > 0);
11108       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11109       /* Data references for gather loads and scatter stores do not use the
11110          updated offset we set using ADVANCE.  Instead we have to make sure the
11111          reference in the data references point to the corresponding copy of
11112          the original in the epilogue.  */
11113       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11114           == VMAT_GATHER_SCATTER)
11115         {
11116           DR_REF (dr)
11117             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11118                                      &find_in_mapping, &mapping);
11119           DR_BASE_ADDRESS (dr)
11120             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11121                                      &find_in_mapping, &mapping);
11122         }
11123       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11124       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11125       /* The vector size of the epilogue is smaller than that of the main loop
11126          so the alignment is either the same or lower. This means the dr will
11127          thus by definition be aligned.  */
11128       STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11129     }
11130
11131   epilogue_vinfo->shared->datarefs_copy.release ();
11132   epilogue_vinfo->shared->save_datarefs ();
11133 }
11134
11135 /* Function vect_transform_loop.
11136
11137    The analysis phase has determined that the loop is vectorizable.
11138    Vectorize the loop - created vectorized stmts to replace the scalar
11139    stmts in the loop, and update the loop exit condition.
11140    Returns scalar epilogue loop if any.  */
11141
11142 class loop *
11143 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11144 {
11145   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11146   class loop *epilogue = NULL;
11147   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11148   int nbbs = loop->num_nodes;
11149   int i;
11150   tree niters_vector = NULL_TREE;
11151   tree step_vector = NULL_TREE;
11152   tree niters_vector_mult_vf = NULL_TREE;
11153   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11154   unsigned int lowest_vf = constant_lower_bound (vf);
11155   gimple *stmt;
11156   bool check_profitability = false;
11157   unsigned int th;
11158
11159   DUMP_VECT_SCOPE ("vec_transform_loop");
11160
11161   loop_vinfo->shared->check_datarefs ();
11162
11163   /* Use the more conservative vectorization threshold.  If the number
11164      of iterations is constant assume the cost check has been performed
11165      by our caller.  If the threshold makes all loops profitable that
11166      run at least the (estimated) vectorization factor number of times
11167      checking is pointless, too.  */
11168   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11169   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11170     {
11171       if (dump_enabled_p ())
11172         dump_printf_loc (MSG_NOTE, vect_location,
11173                          "Profitability threshold is %d loop iterations.\n",
11174                          th);
11175       check_profitability = true;
11176     }
11177
11178   /* Make sure there exists a single-predecessor exit bb.  Do this before
11179      versioning.   */
11180   edge e = single_exit (loop);
11181   if (! single_pred_p (e->dest))
11182     {
11183       split_loop_exit_edge (e, true);
11184       if (dump_enabled_p ())
11185         dump_printf (MSG_NOTE, "split exit edge\n");
11186     }
11187
11188   /* Version the loop first, if required, so the profitability check
11189      comes first.  */
11190
11191   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11192     {
11193       class loop *sloop
11194         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11195       sloop->force_vectorize = false;
11196       check_profitability = false;
11197     }
11198
11199   /* Make sure there exists a single-predecessor exit bb also on the
11200      scalar loop copy.  Do this after versioning but before peeling
11201      so CFG structure is fine for both scalar and if-converted loop
11202      to make slpeel_duplicate_current_defs_from_edges face matched
11203      loop closed PHI nodes on the exit.  */
11204   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11205     {
11206       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11207       if (! single_pred_p (e->dest))
11208         {
11209           split_loop_exit_edge (e, true);
11210           if (dump_enabled_p ())
11211             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11212         }
11213     }
11214
11215   tree niters = vect_build_loop_niters (loop_vinfo);
11216   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11217   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11218   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11219   tree advance;
11220   drs_init_vec orig_drs_init;
11221
11222   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11223                               &step_vector, &niters_vector_mult_vf, th,
11224                               check_profitability, niters_no_overflow,
11225                               &advance);
11226
11227   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11228       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11229     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11230                             LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11231
11232   if (niters_vector == NULL_TREE)
11233     {
11234       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11235           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11236           && known_eq (lowest_vf, vf))
11237         {
11238           niters_vector
11239             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11240                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11241           step_vector = build_one_cst (TREE_TYPE (niters));
11242         }
11243       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11244         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11245                                      &step_vector, niters_no_overflow);
11246       else
11247         /* vect_do_peeling subtracted the number of peeled prologue
11248            iterations from LOOP_VINFO_NITERS.  */
11249         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11250                                      &niters_vector, &step_vector,
11251                                      niters_no_overflow);
11252     }
11253
11254   /* 1) Make sure the loop header has exactly two entries
11255      2) Make sure we have a preheader basic block.  */
11256
11257   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11258
11259   split_edge (loop_preheader_edge (loop));
11260
11261   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11262     /* This will deal with any possible peeling.  */
11263     vect_prepare_for_masked_peels (loop_vinfo);
11264
11265   /* Schedule the SLP instances first, then handle loop vectorization
11266      below.  */
11267   if (!loop_vinfo->slp_instances.is_empty ())
11268     {
11269       DUMP_VECT_SCOPE ("scheduling SLP instances");
11270       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11271     }
11272
11273   /* FORNOW: the vectorizer supports only loops which body consist
11274      of one basic block (header + empty latch). When the vectorizer will
11275      support more involved loop forms, the order by which the BBs are
11276      traversed need to be reconsidered.  */
11277
11278   for (i = 0; i < nbbs; i++)
11279     {
11280       basic_block bb = bbs[i];
11281       stmt_vec_info stmt_info;
11282
11283       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11284            gsi_next (&si))
11285         {
11286           gphi *phi = si.phi ();
11287           if (dump_enabled_p ())
11288             dump_printf_loc (MSG_NOTE, vect_location,
11289                              "------>vectorizing phi: %G", (gimple *) phi);
11290           stmt_info = loop_vinfo->lookup_stmt (phi);
11291           if (!stmt_info)
11292             continue;
11293
11294           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11295             vect_loop_kill_debug_uses (loop, stmt_info);
11296
11297           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11298               && !STMT_VINFO_LIVE_P (stmt_info))
11299             continue;
11300
11301           if (STMT_VINFO_VECTYPE (stmt_info)
11302               && (maybe_ne
11303                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11304               && dump_enabled_p ())
11305             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11306
11307           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11308                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11309                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11310                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11311                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11312                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11313               && ! PURE_SLP_STMT (stmt_info))
11314             {
11315               if (dump_enabled_p ())
11316                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11317               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11318             }
11319         }
11320
11321       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11322            gsi_next (&si))
11323         {
11324           gphi *phi = si.phi ();
11325           stmt_info = loop_vinfo->lookup_stmt (phi);
11326           if (!stmt_info)
11327             continue;
11328
11329           if (!STMT_VINFO_RELEVANT_P (stmt_info)
11330               && !STMT_VINFO_LIVE_P (stmt_info))
11331             continue;
11332
11333           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11334                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11335                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11336                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11337                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11338                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11339               && ! PURE_SLP_STMT (stmt_info))
11340             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11341         }
11342
11343       for (gimple_stmt_iterator si = gsi_start_bb (bb);
11344            !gsi_end_p (si);)
11345         {
11346           stmt = gsi_stmt (si);
11347           /* During vectorization remove existing clobber stmts.  */
11348           if (gimple_clobber_p (stmt))
11349             {
11350               unlink_stmt_vdef (stmt);
11351               gsi_remove (&si, true);
11352               release_defs (stmt);
11353             }
11354           else
11355             {
11356               /* Ignore vector stmts created in the outer loop.  */
11357               stmt_info = loop_vinfo->lookup_stmt (stmt);
11358
11359               /* vector stmts created in the outer-loop during vectorization of
11360                  stmts in an inner-loop may not have a stmt_info, and do not
11361                  need to be vectorized.  */
11362               stmt_vec_info seen_store = NULL;
11363               if (stmt_info)
11364                 {
11365                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11366                     {
11367                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11368                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11369                            !gsi_end_p (subsi); gsi_next (&subsi))
11370                         {
11371                           stmt_vec_info pat_stmt_info
11372                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11373                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11374                                                     &si, &seen_store);
11375                         }
11376                       stmt_vec_info pat_stmt_info
11377                         = STMT_VINFO_RELATED_STMT (stmt_info);
11378                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11379                                                     &si, &seen_store))
11380                         maybe_set_vectorized_backedge_value (loop_vinfo,
11381                                                              pat_stmt_info);
11382                     }
11383                   else
11384                     {
11385                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11386                                                     &seen_store))
11387                         maybe_set_vectorized_backedge_value (loop_vinfo,
11388                                                              stmt_info);
11389                     }
11390                 }
11391               gsi_next (&si);
11392               if (seen_store)
11393                 {
11394                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11395                     /* Interleaving.  If IS_STORE is TRUE, the
11396                        vectorization of the interleaving chain was
11397                        completed - free all the stores in the chain.  */
11398                     vect_remove_stores (loop_vinfo,
11399                                         DR_GROUP_FIRST_ELEMENT (seen_store));
11400                   else
11401                     /* Free the attached stmt_vec_info and remove the stmt.  */
11402                     loop_vinfo->remove_stmt (stmt_info);
11403                 }
11404             }
11405         }
11406
11407       /* Stub out scalar statements that must not survive vectorization.
11408          Doing this here helps with grouped statements, or statements that
11409          are involved in patterns.  */
11410       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11411            !gsi_end_p (gsi); gsi_next (&gsi))
11412         {
11413           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11414           if (!call || !gimple_call_internal_p (call))
11415             continue;
11416           internal_fn ifn = gimple_call_internal_fn (call);
11417           if (ifn == IFN_MASK_LOAD)
11418             {
11419               tree lhs = gimple_get_lhs (call);
11420               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11421                 {
11422                   tree zero = build_zero_cst (TREE_TYPE (lhs));
11423                   gimple *new_stmt = gimple_build_assign (lhs, zero);
11424                   gsi_replace (&gsi, new_stmt, true);
11425                 }
11426             }
11427           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11428             {
11429               tree lhs = gimple_get_lhs (call);
11430               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11431                 {
11432                   tree else_arg
11433                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11434                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11435                   gsi_replace (&gsi, new_stmt, true);
11436                 }
11437             }
11438         }
11439     }                           /* BBs in loop */
11440
11441   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11442      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11443   if (integer_onep (step_vector))
11444     niters_no_overflow = true;
11445   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11446                            niters_vector_mult_vf, !niters_no_overflow);
11447
11448   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11449   scale_profile_for_vect_loop (loop, assumed_vf);
11450
11451   /* True if the final iteration might not handle a full vector's
11452      worth of scalar iterations.  */
11453   bool final_iter_may_be_partial
11454     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11455   /* The minimum number of iterations performed by the epilogue.  This
11456      is 1 when peeling for gaps because we always need a final scalar
11457      iteration.  */
11458   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11459   /* +1 to convert latch counts to loop iteration counts,
11460      -min_epilogue_iters to remove iterations that cannot be performed
11461        by the vector code.  */
11462   int bias_for_lowest = 1 - min_epilogue_iters;
11463   int bias_for_assumed = bias_for_lowest;
11464   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11465   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11466     {
11467       /* When the amount of peeling is known at compile time, the first
11468          iteration will have exactly alignment_npeels active elements.
11469          In the worst case it will have at least one.  */
11470       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11471       bias_for_lowest += lowest_vf - min_first_active;
11472       bias_for_assumed += assumed_vf - min_first_active;
11473     }
11474   /* In these calculations the "- 1" converts loop iteration counts
11475      back to latch counts.  */
11476   if (loop->any_upper_bound)
11477     {
11478       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11479       loop->nb_iterations_upper_bound
11480         = (final_iter_may_be_partial
11481            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11482                             lowest_vf) - 1
11483            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11484                              lowest_vf) - 1);
11485       if (main_vinfo
11486           /* Both peeling for alignment and peeling for gaps can end up
11487              with the scalar epilogue running for more than VF-1 iterations.  */
11488           && !main_vinfo->peeling_for_alignment
11489           && !main_vinfo->peeling_for_gaps)
11490         {
11491           unsigned int bound;
11492           poly_uint64 main_iters
11493             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11494                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11495           main_iters
11496             = upper_bound (main_iters,
11497                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11498           if (can_div_away_from_zero_p (main_iters,
11499                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11500                                         &bound))
11501             loop->nb_iterations_upper_bound
11502               = wi::umin ((widest_int) (bound - 1),
11503                           loop->nb_iterations_upper_bound);
11504       }
11505   }
11506   if (loop->any_likely_upper_bound)
11507     loop->nb_iterations_likely_upper_bound
11508       = (final_iter_may_be_partial
11509          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11510                           + bias_for_lowest, lowest_vf) - 1
11511          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11512                            + bias_for_lowest, lowest_vf) - 1);
11513   if (loop->any_estimate)
11514     loop->nb_iterations_estimate
11515       = (final_iter_may_be_partial
11516          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11517                           assumed_vf) - 1
11518          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11519                            assumed_vf) - 1);
11520
11521   if (dump_enabled_p ())
11522     {
11523       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11524         {
11525           dump_printf_loc (MSG_NOTE, vect_location,
11526                            "LOOP VECTORIZED\n");
11527           if (loop->inner)
11528             dump_printf_loc (MSG_NOTE, vect_location,
11529                              "OUTER LOOP VECTORIZED\n");
11530           dump_printf (MSG_NOTE, "\n");
11531         }
11532       else
11533         dump_printf_loc (MSG_NOTE, vect_location,
11534                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11535                          GET_MODE_NAME (loop_vinfo->vector_mode));
11536     }
11537
11538   /* Loops vectorized with a variable factor won't benefit from
11539      unrolling/peeling.  */
11540   if (!vf.is_constant ())
11541     {
11542       loop->unroll = 1;
11543       if (dump_enabled_p ())
11544         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11545                          " variable-length vectorization factor\n");
11546     }
11547   /* Free SLP instances here because otherwise stmt reference counting
11548      won't work.  */
11549   slp_instance instance;
11550   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11551     vect_free_slp_instance (instance);
11552   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11553   /* Clear-up safelen field since its value is invalid after vectorization
11554      since vectorized loop can have loop-carried dependencies.  */
11555   loop->safelen = 0;
11556
11557   if (epilogue)
11558     {
11559       update_epilogue_loop_vinfo (epilogue, advance);
11560
11561       epilogue->simduid = loop->simduid;
11562       epilogue->force_vectorize = loop->force_vectorize;
11563       epilogue->dont_vectorize = false;
11564     }
11565
11566   return epilogue;
11567 }
11568
11569 /* The code below is trying to perform simple optimization - revert
11570    if-conversion for masked stores, i.e. if the mask of a store is zero
11571    do not perform it and all stored value producers also if possible.
11572    For example,
11573      for (i=0; i<n; i++)
11574        if (c[i])
11575         {
11576           p1[i] += 1;
11577           p2[i] = p3[i] +2;
11578         }
11579    this transformation will produce the following semi-hammock:
11580
11581    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11582      {
11583        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11584        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11585        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11586        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11587        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11588        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11589      }
11590 */
11591
11592 void
11593 optimize_mask_stores (class loop *loop)
11594 {
11595   basic_block *bbs = get_loop_body (loop);
11596   unsigned nbbs = loop->num_nodes;
11597   unsigned i;
11598   basic_block bb;
11599   class loop *bb_loop;
11600   gimple_stmt_iterator gsi;
11601   gimple *stmt;
11602   auto_vec<gimple *> worklist;
11603   auto_purge_vect_location sentinel;
11604
11605   vect_location = find_loop_location (loop);
11606   /* Pick up all masked stores in loop if any.  */
11607   for (i = 0; i < nbbs; i++)
11608     {
11609       bb = bbs[i];
11610       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11611            gsi_next (&gsi))
11612         {
11613           stmt = gsi_stmt (gsi);
11614           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11615             worklist.safe_push (stmt);
11616         }
11617     }
11618
11619   free (bbs);
11620   if (worklist.is_empty ())
11621     return;
11622
11623   /* Loop has masked stores.  */
11624   while (!worklist.is_empty ())
11625     {
11626       gimple *last, *last_store;
11627       edge e, efalse;
11628       tree mask;
11629       basic_block store_bb, join_bb;
11630       gimple_stmt_iterator gsi_to;
11631       tree vdef, new_vdef;
11632       gphi *phi;
11633       tree vectype;
11634       tree zero;
11635
11636       last = worklist.pop ();
11637       mask = gimple_call_arg (last, 2);
11638       bb = gimple_bb (last);
11639       /* Create then_bb and if-then structure in CFG, then_bb belongs to
11640          the same loop as if_bb.  It could be different to LOOP when two
11641          level loop-nest is vectorized and mask_store belongs to the inner
11642          one.  */
11643       e = split_block (bb, last);
11644       bb_loop = bb->loop_father;
11645       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11646       join_bb = e->dest;
11647       store_bb = create_empty_bb (bb);
11648       add_bb_to_loop (store_bb, bb_loop);
11649       e->flags = EDGE_TRUE_VALUE;
11650       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11651       /* Put STORE_BB to likely part.  */
11652       efalse->probability = profile_probability::unlikely ();
11653       store_bb->count = efalse->count ();
11654       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11655       if (dom_info_available_p (CDI_DOMINATORS))
11656         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11657       if (dump_enabled_p ())
11658         dump_printf_loc (MSG_NOTE, vect_location,
11659                          "Create new block %d to sink mask stores.",
11660                          store_bb->index);
11661       /* Create vector comparison with boolean result.  */
11662       vectype = TREE_TYPE (mask);
11663       zero = build_zero_cst (vectype);
11664       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11665       gsi = gsi_last_bb (bb);
11666       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11667       /* Create new PHI node for vdef of the last masked store:
11668          .MEM_2 = VDEF <.MEM_1>
11669          will be converted to
11670          .MEM.3 = VDEF <.MEM_1>
11671          and new PHI node will be created in join bb
11672          .MEM_2 = PHI <.MEM_1, .MEM_3>
11673       */
11674       vdef = gimple_vdef (last);
11675       new_vdef = make_ssa_name (gimple_vop (cfun), last);
11676       gimple_set_vdef (last, new_vdef);
11677       phi = create_phi_node (vdef, join_bb);
11678       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11679
11680       /* Put all masked stores with the same mask to STORE_BB if possible.  */
11681       while (true)
11682         {
11683           gimple_stmt_iterator gsi_from;
11684           gimple *stmt1 = NULL;
11685
11686           /* Move masked store to STORE_BB.  */
11687           last_store = last;
11688           gsi = gsi_for_stmt (last);
11689           gsi_from = gsi;
11690           /* Shift GSI to the previous stmt for further traversal.  */
11691           gsi_prev (&gsi);
11692           gsi_to = gsi_start_bb (store_bb);
11693           gsi_move_before (&gsi_from, &gsi_to);
11694           /* Setup GSI_TO to the non-empty block start.  */
11695           gsi_to = gsi_start_bb (store_bb);
11696           if (dump_enabled_p ())
11697             dump_printf_loc (MSG_NOTE, vect_location,
11698                              "Move stmt to created bb\n%G", last);
11699           /* Move all stored value producers if possible.  */
11700           while (!gsi_end_p (gsi))
11701             {
11702               tree lhs;
11703               imm_use_iterator imm_iter;
11704               use_operand_p use_p;
11705               bool res;
11706
11707               /* Skip debug statements.  */
11708               if (is_gimple_debug (gsi_stmt (gsi)))
11709                 {
11710                   gsi_prev (&gsi);
11711                   continue;
11712                 }
11713               stmt1 = gsi_stmt (gsi);
11714               /* Do not consider statements writing to memory or having
11715                  volatile operand.  */
11716               if (gimple_vdef (stmt1)
11717                   || gimple_has_volatile_ops (stmt1))
11718                 break;
11719               gsi_from = gsi;
11720               gsi_prev (&gsi);
11721               lhs = gimple_get_lhs (stmt1);
11722               if (!lhs)
11723                 break;
11724
11725               /* LHS of vectorized stmt must be SSA_NAME.  */
11726               if (TREE_CODE (lhs) != SSA_NAME)
11727                 break;
11728
11729               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11730                 {
11731                   /* Remove dead scalar statement.  */
11732                   if (has_zero_uses (lhs))
11733                     {
11734                       gsi_remove (&gsi_from, true);
11735                       continue;
11736                     }
11737                 }
11738
11739               /* Check that LHS does not have uses outside of STORE_BB.  */
11740               res = true;
11741               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11742                 {
11743                   gimple *use_stmt;
11744                   use_stmt = USE_STMT (use_p);
11745                   if (is_gimple_debug (use_stmt))
11746                     continue;
11747                   if (gimple_bb (use_stmt) != store_bb)
11748                     {
11749                       res = false;
11750                       break;
11751                     }
11752                 }
11753               if (!res)
11754                 break;
11755
11756               if (gimple_vuse (stmt1)
11757                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
11758                 break;
11759
11760               /* Can move STMT1 to STORE_BB.  */
11761               if (dump_enabled_p ())
11762                 dump_printf_loc (MSG_NOTE, vect_location,
11763                                  "Move stmt to created bb\n%G", stmt1);
11764               gsi_move_before (&gsi_from, &gsi_to);
11765               /* Shift GSI_TO for further insertion.  */
11766               gsi_prev (&gsi_to);
11767             }
11768           /* Put other masked stores with the same mask to STORE_BB.  */
11769           if (worklist.is_empty ()
11770               || gimple_call_arg (worklist.last (), 2) != mask
11771               || worklist.last () != stmt1)
11772             break;
11773           last = worklist.pop ();
11774         }
11775       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11776     }
11777 }
11778
11779 /* Decide whether it is possible to use a zero-based induction variable
11780    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
11781    the value that the induction variable must be able to hold in order
11782    to ensure that the rgroups eventually have no active vector elements.
11783    Return -1 otherwise.  */
11784
11785 widest_int
11786 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11787 {
11788   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11789   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11790   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11791
11792   /* Calculate the value that the induction variable must be able
11793      to hit in order to ensure that we end the loop with an all-false mask.
11794      This involves adding the maximum number of inactive trailing scalar
11795      iterations.  */
11796   widest_int iv_limit = -1;
11797   if (max_loop_iterations (loop, &iv_limit))
11798     {
11799       if (niters_skip)
11800         {
11801           /* Add the maximum number of skipped iterations to the
11802              maximum iteration count.  */
11803           if (TREE_CODE (niters_skip) == INTEGER_CST)
11804             iv_limit += wi::to_widest (niters_skip);
11805           else
11806             iv_limit += max_vf - 1;
11807         }
11808       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11809         /* Make a conservatively-correct assumption.  */
11810         iv_limit += max_vf - 1;
11811
11812       /* IV_LIMIT is the maximum number of latch iterations, which is also
11813          the maximum in-range IV value.  Round this value down to the previous
11814          vector alignment boundary and then add an extra full iteration.  */
11815       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11816       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11817     }
11818   return iv_limit;
11819 }
11820
11821 /* For the given rgroup_controls RGC, check whether an induction variable
11822    would ever hit a value that produces a set of all-false masks or zero
11823    lengths before wrapping around.  Return true if it's possible to wrap
11824    around before hitting the desirable value, otherwise return false.  */
11825
11826 bool
11827 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11828 {
11829   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11830
11831   if (iv_limit == -1)
11832     return true;
11833
11834   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11835   unsigned int compare_precision = TYPE_PRECISION (compare_type);
11836   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11837
11838   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11839     return true;
11840
11841   return false;
11842 }