tree-optimization/113026 - avoid vector epilog in more cases
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype has already been set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
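/* For example (illustrative only, not taken from this file), latch
   updates of the forms

     x = -x;          neg
     x = x * 3;       mul by constant
     x = x << 1;      lshift by constant
     x = x >> 1;      rshift by constant

   give rise to the nonlinear evolutions classified below.  */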
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
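/* At the source level a double reduction typically comes from a nested
   summation, e.g. (illustrative only, not taken from this file):

     int sum = 0;
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];

   where the outer and inner loop-carried values of SUM correspond to
   x_1 and x_2 above.  */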
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
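/* A typical source form of a first-order recurrence is (illustrative
   only, not taken from this file):

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   where each iteration uses the value of T computed in the previous
   iteration.  */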
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as a first-order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs a vector shuffle. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses.
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
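/* For example (illustrative only), when vectorizing the outer loop of

     for (i = 0; i < n; i++)
       {
         s = 0;
         for (j = 0; j < m; j++)
           s += a[i][j];
         b[i] = s;
       }

   the accumulation of S is such an inner-loop reduction: it is executed
   sequentially per outer iteration and is classified as a nested cycle
   rather than as a regular reduction.  */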
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns, adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment. Analyze
968 all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 early_breaks (false),
1044 no_data_dependencies (false),
1045 has_mask_store (false),
1046 scalar_loop_scaling (profile_probability::uninitialized ()),
1047 scalar_loop (NULL),
1048 orig_loop_info (NULL),
1049 vec_loop_iv_exit (NULL),
1050 vec_epilogue_loop_iv_exit (NULL),
1051 scalar_loop_iv_exit (NULL)
1053 /* CHECKME: We want to visit all BBs before their successors (except for
1054 latch blocks, for which this assertion wouldn't hold). In the simple
1055 case of the loop forms we allow, a dfs order of the BBs would be the same
1056 as reversed postorder traversal, so we are safe. */
1058 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1059 bbs, loop->num_nodes, loop);
1060 gcc_assert (nbbs == loop->num_nodes);
1062 for (unsigned int i = 0; i < nbbs; i++)
1064 basic_block bb = bbs[i];
1065 gimple_stmt_iterator si;
1067 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1069 gimple *phi = gsi_stmt (si);
1070 gimple_set_uid (phi, 0);
1071 add_stmt (phi);
1074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1076 gimple *stmt = gsi_stmt (si);
1077 gimple_set_uid (stmt, 0);
1078 if (is_gimple_debug (stmt))
1079 continue;
1080 add_stmt (stmt);
1081 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1082 third argument is the #pragma omp simd if (x) condition: when it is 0,
1083 the loop shouldn't be vectorized; when it is a non-zero constant, it
1084 should be vectorized normally; otherwise the loop is versioned, with the
1085 vectorized version executed if the condition is non-zero at runtime. */
1086 if (loop_in->simduid
1087 && is_gimple_call (stmt)
1088 && gimple_call_internal_p (stmt)
1089 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1090 && gimple_call_num_args (stmt) >= 3
1091 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1092 && (loop_in->simduid
1093 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1095 tree arg = gimple_call_arg (stmt, 2);
1096 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1097 simd_if_cond = arg;
1098 else
1099 gcc_assert (integer_nonzerop (arg));
1104 epilogue_vinfos.create (6);
1107 /* Free all levels of rgroup CONTROLS. */
1109 void
1110 release_vec_loop_controls (vec<rgroup_controls> *controls)
1112 rgroup_controls *rgc;
1113 unsigned int i;
1114 FOR_EACH_VEC_ELT (*controls, i, rgc)
1115 rgc->controls.release ();
1116 controls->release ();
1119 /* Free all memory used by the _loop_vec_info, as well as all the
1120 stmt_vec_info structs of all the stmts in the loop. */
1122 _loop_vec_info::~_loop_vec_info ()
1124 free (bbs);
1126 release_vec_loop_controls (&masks.rgc_vec);
1127 release_vec_loop_controls (&lens);
1128 delete ivexpr_map;
1129 delete scan_map;
1130 epilogue_vinfos.release ();
1131 delete scalar_costs;
1132 delete vector_costs;
1134 /* When we release an epilogue vinfo that we do not intend to use,
1135 avoid clearing AUX of the main loop, which should continue to
1136 point to the main loop vinfo since otherwise we'll leak that. */
1137 if (loop->aux == this)
1138 loop->aux = NULL;
1141 /* Return an invariant or register for EXPR and emit necessary
1142 computations in the LOOP_VINFO loop preheader. */
1144 tree
1145 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1147 if (is_gimple_reg (expr)
1148 || is_gimple_min_invariant (expr))
1149 return expr;
1151 if (! loop_vinfo->ivexpr_map)
1152 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1153 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1154 if (! cached)
1156 gimple_seq stmts = NULL;
1157 cached = force_gimple_operand (unshare_expr (expr),
1158 &stmts, true, NULL_TREE);
1159 if (stmts)
1161 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1162 gsi_insert_seq_on_edge_immediate (e, stmts);
1165 return cached;
1168 /* Return true if we can use CMP_TYPE as the comparison type to produce
1169 all masks required to mask LOOP_VINFO. */
1171 static bool
1172 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1174 rgroup_controls *rgm;
1175 unsigned int i;
1176 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1177 if (rgm->type != NULL_TREE
1178 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1179 cmp_type, rgm->type,
1180 OPTIMIZE_FOR_SPEED))
1181 return false;
1182 return true;
1185 /* Calculate the maximum number of scalars per iteration for every
1186 rgroup in LOOP_VINFO. */
1188 static unsigned int
1189 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1191 unsigned int res = 1;
1192 unsigned int i;
1193 rgroup_controls *rgm;
1194 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1195 res = MAX (res, rgm->max_nscalars_per_iter);
1196 return res;
1199 /* Calculate the minimum precision necessary to represent:
1201 MAX_NITERS * FACTOR
1203 as an unsigned integer, where MAX_NITERS is the maximum number of
1204 loop header iterations for the original scalar form of LOOP_VINFO. */
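/* For example (illustrative only): with a maximum of 1000 loop header
   iterations and FACTOR == 4 the product is 4000, which needs 12 bits
   as an unsigned integer (4000 < 2^12 == 4096).  */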
1206 static unsigned
1207 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1209 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1211 /* Get the maximum number of iterations that is representable
1212 in the counter type. */
1213 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1214 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1216 /* Get a more refined estimate for the number of iterations. */
1217 widest_int max_back_edges;
1218 if (max_loop_iterations (loop, &max_back_edges))
1219 max_ni = wi::smin (max_ni, max_back_edges + 1);
1221 /* Work out how many bits we need to represent the limit. */
1222 return wi::min_precision (max_ni * factor, UNSIGNED);
1225 /* True if the loop needs peeling or partial vectors when vectorized. */
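/* For example (illustrative only): a loop with 10 known iterations and
   a vectorization factor of 4 leaves 2 iterations that do not fill a
   vector, so it needs either an epilogue or partial vectors.  */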
1227 static bool
1228 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1230 unsigned HOST_WIDE_INT const_vf;
1231 HOST_WIDE_INT max_niter
1232 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1234 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1235 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1236 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1237 (loop_vinfo));
1239 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1240 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1242 /* Work out the (constant) number of iterations that need to be
1243 peeled for reasons other than niters. */
1244 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1245 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1246 peel_niter += 1;
1247 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1248 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1249 return true;
1251 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1252 /* ??? When peeling for gaps but not alignment, we could
1253 try to check whether the (variable) niters is known to be
1254 VF * N + 1. That's something of a niche case though. */
1255 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1256 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1257 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1258 < (unsigned) exact_log2 (const_vf))
1259 /* In case of versioning, check if the maximum number of
1260 iterations is greater than th. If they are identical,
1261 the epilogue is unnecessary. */
1262 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1263 || ((unsigned HOST_WIDE_INT) max_niter
1264 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1265 but that's only computed later based on our result.
1266 The following is the most conservative approximation. */
1267 > (std::max ((unsigned HOST_WIDE_INT) th,
1268 const_vf) / const_vf) * const_vf))))
1269 return true;
1271 return false;
1274 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1275 whether we can actually generate the masks required. Return true if so,
1276 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
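/* Conceptually (an illustrative sketch, not the exact code generated),
   a fully-masked loop compares a scalar IV against the number of scalar
   iterations to produce each control mask:

     for (i = 0; i < niters; i += VF)
       {
         mask = WHILE_ULT (i, niters);    lane L is active iff i + L < niters
         va = .MASK_LOAD (&b[i], ..., mask);
         .MASK_STORE (&a[i], ..., mask, va);
       }

   with inactive lanes neither loaded nor stored.  */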
1278 static bool
1279 vect_verify_full_masking (loop_vec_info loop_vinfo)
1281 unsigned int min_ni_width;
1283 /* Use a normal loop if there are no statements that need masking.
1284 This only happens in rare degenerate cases: it means that the loop
1285 has no loads, no stores, and no live-out values. */
1286 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1287 return false;
1289 /* Produce the rgroup controls. */
1290 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1292 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1293 tree vectype = mask.first;
1294 unsigned nvectors = mask.second;
1296 if (masks->rgc_vec.length () < nvectors)
1297 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1298 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1299 /* The number of scalars per iteration and the number of vectors are
1300 both compile-time constants. */
1301 unsigned int nscalars_per_iter
1302 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1303 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1305 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1307 rgm->max_nscalars_per_iter = nscalars_per_iter;
1308 rgm->type = truth_type_for (vectype);
1309 rgm->factor = 1;
1313 unsigned int max_nscalars_per_iter
1314 = vect_get_max_nscalars_per_iter (loop_vinfo);
1316 /* Work out how many bits we need to represent the limit. */
1317 min_ni_width
1318 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1320 /* Find a scalar mode for which WHILE_ULT is supported. */
1321 opt_scalar_int_mode cmp_mode_iter;
1322 tree cmp_type = NULL_TREE;
1323 tree iv_type = NULL_TREE;
1324 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1325 unsigned int iv_precision = UINT_MAX;
1327 if (iv_limit != -1)
1328 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1329 UNSIGNED);
1331 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1333 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1334 if (cmp_bits >= min_ni_width
1335 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1337 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1338 if (this_type
1339 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1341 /* Although we could stop as soon as we find a valid mode,
1342 there are at least two reasons why that's not always the
1343 best choice:
1345 - An IV that's Pmode or wider is more likely to be reusable
1346 in address calculations than an IV that's narrower than
1347 Pmode.
1349 - Doing the comparison in IV_PRECISION or wider allows
1350 a natural 0-based IV, whereas using a narrower comparison
1351 type requires mitigations against wrap-around.
1353 Conversely, if the IV limit is variable, doing the comparison
1354 in a wider type than the original type can introduce
1355 unnecessary extensions, so picking the widest valid mode
1356 is not always a good choice either.
1358 Here we prefer the first IV type that's Pmode or wider,
1359 and the first comparison type that's IV_PRECISION or wider.
1360 (The comparison type must be no wider than the IV type,
1361 to avoid extensions in the vector loop.)
1363 ??? We might want to try continuing beyond Pmode for ILP32
1364 targets if CMP_BITS < IV_PRECISION. */
1365 iv_type = this_type;
1366 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1367 cmp_type = this_type;
1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 break;
1374 if (!cmp_type)
1376 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1377 return false;
1380 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1381 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1382 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1383 return true;
1386 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1387 whether we can actually generate AVX512 style masks. Return true if so,
1388 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
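/* Conceptually (an illustrative sketch), instead of WHILE_ULT this
   style produces each mask with a vector comparison of the lane
   indices against the remaining number of scalar iterations:

     mask = { 0, 1, ..., VF-1 } < { remain, remain, ..., remain };

   which is why a suitable integer vector compare type is searched for
   below.  */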
1390 static bool
1391 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1393 /* Produce a differently organized rgc_vec and check differently
1394 whether we can produce the masks. */
1396 /* Use a normal loop if there are no statements that need masking.
1397 This only happens in rare degenerate cases: it means that the loop
1398 has no loads, no stores, and no live-out values. */
1399 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1400 return false;
1402 /* For the decrementing IV we need to represent all values in
1403 [0, niter + niter_skip] where niter_skip is the number of elements we
1404 skip in the first iteration for prologue peeling. */
1405 tree iv_type = NULL_TREE;
1406 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1407 unsigned int iv_precision = UINT_MAX;
1408 if (iv_limit != -1)
1409 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1411 /* First compute the type for the IV we use to track the remaining
1412 scalar iterations. */
1413 opt_scalar_int_mode cmp_mode_iter;
1414 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1416 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1417 if (cmp_bits >= iv_precision
1418 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1420 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1421 if (iv_type)
1422 break;
1425 if (!iv_type)
1426 return false;
1428 /* Produce the rgroup controls. */
1429 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1431 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1432 tree vectype = mask.first;
1433 unsigned nvectors = mask.second;
1435 /* The number of scalars per iteration and the number of vectors are
1436 both compile-time constants. */
1437 unsigned int nscalars_per_iter
1438 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1439 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1441 /* We index the rgroup_controls vector with nscalars_per_iter
1442 which we keep constant and instead have a varying nvectors,
1443 remembering the vector mask with the fewest nV. */
1444 if (masks->rgc_vec.length () < nscalars_per_iter)
1445 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1446 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1448 if (!rgm->type || rgm->factor > nvectors)
1450 rgm->type = truth_type_for (vectype);
1451 rgm->compare_type = NULL_TREE;
1452 rgm->max_nscalars_per_iter = nscalars_per_iter;
1453 rgm->factor = nvectors;
1454 rgm->bias_adjusted_ctrl = NULL_TREE;
1458 /* There is no fixed compare type we are going to use but we have to
1459 be able to get at one for each mask group. */
1460 unsigned int min_ni_width
1461 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1463 bool ok = true;
1464 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1466 tree mask_type = rgc.type;
1467 if (!mask_type)
1468 continue;
1470 /* For now vect_get_loop_mask only supports integer mode masks
1471 when we need to split it. */
1472 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1473 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1475 ok = false;
1476 break;
1479 /* If iv_type is usable as compare type use that - we can elide the
1480 saturation in that case. */
1481 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1483 tree cmp_vectype
1484 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1485 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1486 rgc.compare_type = cmp_vectype;
1488 if (!rgc.compare_type)
1489 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1491 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1492 if (cmp_bits >= min_ni_width
1493 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1495 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1496 if (!cmp_type)
1497 continue;
1499 /* Check whether we can produce the mask with cmp_type. */
1500 tree cmp_vectype
1501 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1502 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1504 rgc.compare_type = cmp_vectype;
1505 break;
1509 if (!rgc.compare_type)
1511 ok = false;
1512 break;
1515 if (!ok)
1517 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1518 return false;
1521 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1522 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1523 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1524 return true;
1527 /* Check whether we can use vector access with length based on precision
1528 comparison. So far, to keep it simple, we only allow the case that the
1529 precision of the target supported length is larger than the precision
1530 required by loop niters. */
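/* Conceptually (an illustrative sketch, ignoring any load/store bias),
   a length-controlled loop clamps the number of active lanes in each
   iteration:

     for (i = 0; i < niters; i += VF)
       {
         len = MIN (niters - i, VF);
         va = .LEN_LOAD (&b[i], ..., len);
         .LEN_STORE (&a[i], ..., len, va);
       }

   so the final iteration processes only the leftover elements.  */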
1532 static bool
1533 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1535 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1536 return false;
1538 machine_mode len_load_mode, len_store_mode;
1539 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1540 .exists (&len_load_mode))
1541 return false;
1542 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1543 .exists (&len_store_mode))
1544 return false;
1546 signed char partial_load_bias = internal_len_load_store_bias
1547 (IFN_LEN_LOAD, len_load_mode);
1549 signed char partial_store_bias = internal_len_load_store_bias
1550 (IFN_LEN_STORE, len_store_mode);
1552 gcc_assert (partial_load_bias == partial_store_bias);
1554 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1555 return false;
1557 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1558 len_loads with a length of zero. In order to avoid that we prohibit
1559 more than one loop length here. */
1560 if (partial_load_bias == -1
1561 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1562 return false;
1564 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1566 unsigned int max_nitems_per_iter = 1;
1567 unsigned int i;
1568 rgroup_controls *rgl;
1569 /* Find the maximum number of items per iteration for every rgroup. */
1570 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1572 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1573 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1576 /* Work out how many bits we need to represent the length limit. */
1577 unsigned int min_ni_prec
1578 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1580 /* Now use the maximum of the precisions below for one suitable IV type:
1581 - the IV's natural precision
1582 - the precision needed to hold: the maximum number of scalar
1583 iterations multiplied by the scale factor (min_ni_prec above)
1584 - the Pmode precision
1586 If min_ni_prec is less than the precision of the current niters,
1587 we prefer to still use the niters type. Prefer to use Pmode and
1588 wider IV to avoid narrow conversions. */
1590 unsigned int ni_prec
1591 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1592 min_ni_prec = MAX (min_ni_prec, ni_prec);
1593 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1595 tree iv_type = NULL_TREE;
1596 opt_scalar_int_mode tmode_iter;
1597 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1599 scalar_mode tmode = tmode_iter.require ();
1600 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1602 /* ??? Do we really want to construct one IV whose precision exceeds
1603 BITS_PER_WORD? */
1604 if (tbits > BITS_PER_WORD)
1605 break;
1607 /* Find the first available standard integral type. */
1608 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1610 iv_type = build_nonstandard_integer_type (tbits, true);
1611 break;
1615 if (!iv_type)
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1619 "can't vectorize with length-based partial vectors"
1620 " because there is no suitable iv type.\n");
1621 return false;
1624 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1625 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1626 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1628 return true;
1631 /* Calculate the cost of one scalar iteration of the loop. */
1632 static void
1633 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1635 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1636 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1637 int nbbs = loop->num_nodes, factor;
1638 int innerloop_iters, i;
1640 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1642 /* Gather costs for statements in the scalar loop. */
1644 /* FORNOW. */
1645 innerloop_iters = 1;
1646 if (loop->inner)
1647 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1649 for (i = 0; i < nbbs; i++)
1651 gimple_stmt_iterator si;
1652 basic_block bb = bbs[i];
1654 if (bb->loop_father == loop->inner)
1655 factor = innerloop_iters;
1656 else
1657 factor = 1;
1659 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1661 gimple *stmt = gsi_stmt (si);
1662 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1664 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1665 continue;
1667 /* Skip stmts that are not vectorized inside the loop. */
1668 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1669 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1670 && (!STMT_VINFO_LIVE_P (vstmt_info)
1671 || !VECTORIZABLE_CYCLE_DEF
1672 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1673 continue;
1675 vect_cost_for_stmt kind;
1676 if (STMT_VINFO_DATA_REF (stmt_info))
1678 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1679 kind = scalar_load;
1680 else
1681 kind = scalar_store;
1683 else if (vect_nop_conversion_p (stmt_info))
1684 continue;
1685 else
1686 kind = scalar_stmt;
1688 /* We are using vect_prologue here to avoid scaling twice
1689 by the inner loop factor. */
1690 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1691 factor, kind, stmt_info, 0, vect_prologue);
1695 /* Now accumulate cost. */
1696 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1697 add_stmt_costs (loop_vinfo->scalar_costs,
1698 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1699 loop_vinfo->scalar_costs->finish_cost (nullptr);
1702 /* Function vect_analyze_loop_form.
1704 Verify that certain CFG restrictions hold, including:
1705 - the loop has a pre-header
1706 - the loop has a single entry
1707 - nested loops can have only a single exit.
1708 - the loop exit condition is simple enough
1709 - the number of iterations can be analyzed, i.e, a countable loop. The
1710 niter could be analyzed under some assumptions. */
1712 opt_result
1713 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1715 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1717 edge exit_e = vec_init_loop_exit_info (loop);
1718 if (!exit_e)
1719 return opt_result::failure_at (vect_location,
1720 "not vectorized:"
1721 " could not determine main exit from"
1722 " loop with multiple exits.\n");
1723 info->loop_exit = exit_e;
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "using as main loop exit: %d -> %d [AUX: %p]\n",
1727 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1729 /* Check if we have any control flow that doesn't leave the loop. */
1730 class loop *v_loop = loop->inner ? loop->inner : loop;
1731 basic_block *bbs= get_loop_body (v_loop);
1732 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1733 if (EDGE_COUNT (bbs[i]->succs) != 1
1734 && (EDGE_COUNT (bbs[i]->succs) != 2
1735 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1736 return opt_result::failure_at (vect_location,
1737 "not vectorized:"
1738 " unsupported control flow in loop.\n");
1740 /* Different restrictions apply when we are considering an inner-most loop,
1741 vs. an outer (nested) loop.
1742 (FORNOW. May want to relax some of these restrictions in the future). */
1744 info->inner_loop_cond = NULL;
1745 if (!loop->inner)
1747 /* Inner-most loop. We currently require that the number of BBs is
1748 exactly 2 (the header and latch). Vectorizable inner-most loops
1749 look like this:
1751 (pre-header)
1753 header <--------+
1754 | | |
1755 | +--> latch --+
1757 (exit-bb) */
1759 if (empty_block_p (loop->header))
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized: empty loop.\n");
1763 else
1765 class loop *innerloop = loop->inner;
1766 edge entryedge;
1768 /* Nested loop. We currently require that the loop is doubly-nested,
1769 contains a single inner loop, and the number of BBs is exactly 5.
1770 Vectorizable outer-loops look like this:
1772 (pre-header)
1774 header <---+
1776 inner-loop |
1778 tail ------+
1780 (exit-bb)
1782 The inner-loop has the properties expected of inner-most loops
1783 as described above. */
1785 if ((loop->inner)->inner || (loop->inner)->next)
1786 return opt_result::failure_at (vect_location,
1787 "not vectorized:"
1788 " multiple nested loops.\n");
1790 entryedge = loop_preheader_edge (innerloop);
1791 if (entryedge->src != loop->header
1792 || !single_exit (innerloop)
1793 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1794 return opt_result::failure_at (vect_location,
1795 "not vectorized:"
1796 " unsupported outerloop form.\n");
1798 /* Analyze the inner-loop. */
1799 vect_loop_form_info inner;
1800 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1801 if (!res)
1803 if (dump_enabled_p ())
1804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805 "not vectorized: Bad inner loop.\n");
1806 return res;
1809 /* Don't support analyzing niter under assumptions for inner
1810 loop. */
1811 if (!integer_onep (inner.assumptions))
1812 return opt_result::failure_at (vect_location,
1813 "not vectorized: Bad inner loop.\n");
1815 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1816 return opt_result::failure_at (vect_location,
1817 "not vectorized: inner-loop count not"
1818 " invariant.\n");
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_NOTE, vect_location,
1822 "Considering outer-loop vectorization.\n");
1823 info->inner_loop_cond = inner.conds[0];
1826 if (EDGE_COUNT (loop->header->preds) != 2)
1827 return opt_result::failure_at (vect_location,
1828 "not vectorized:"
1829 " too many incoming edges.\n");
1831 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1832 that the loop is represented as a do-while (with a proper if-guard
1833 before the loop if needed), where the loop header contains all the
1834 executable statements, and the latch is empty. */
1835 if (!empty_block_p (loop->latch)
1836 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1837 return opt_result::failure_at (vect_location,
1838 "not vectorized: latch block not empty.\n");
1840 /* Make sure the exit is not abnormal. */
1841 auto_vec<edge> exits = get_loop_exit_edges (loop);
1842 for (edge e : exits)
1844 if (e->flags & EDGE_ABNORMAL)
1845 return opt_result::failure_at (vect_location,
1846 "not vectorized:"
1847 " abnormal loop exit edge.\n");
1850 info->conds
1851 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1852 &info->number_of_iterations,
1853 &info->number_of_iterationsm1);
1855 if (info->conds.is_empty ())
1856 return opt_result::failure_at
1857 (vect_location,
1858 "not vectorized: complicated exit condition.\n");
1860 /* Determine what the primary and alternate exit conds are. */
1861 for (unsigned i = 0; i < info->conds.length (); i++)
1863 gcond *cond = info->conds[i];
1864 if (exit_e->src == gimple_bb (cond))
1865 std::swap (info->conds[0], info->conds[i]);
1868 if (integer_zerop (info->assumptions)
1869 || !info->number_of_iterations
1870 || chrec_contains_undetermined (info->number_of_iterations))
1871 return opt_result::failure_at
1872 (info->conds[0],
1873 "not vectorized: number of iterations cannot be computed.\n");
1875 if (integer_zerop (info->number_of_iterations))
1876 return opt_result::failure_at
1877 (info->conds[0],
1878 "not vectorized: number of iterations = 0.\n");
1880 if (!(tree_fits_shwi_p (info->number_of_iterations)
1881 && tree_to_shwi (info->number_of_iterations) > 0))
1883 if (dump_enabled_p ())
1885 dump_printf_loc (MSG_NOTE, vect_location,
1886 "Symbolic number of iterations is ");
1887 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1888 dump_printf (MSG_NOTE, "\n");
1892 return opt_result::success ();
1895 /* Create a loop_vec_info for LOOP with SHARED and the
1896 vect_analyze_loop_form result. */
1898 loop_vec_info
1899 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1900 const vect_loop_form_info *info,
1901 loop_vec_info main_loop_info)
1903 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1904 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1905 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1906 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1907 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1908 /* Also record the assumptions for versioning. */
1909 if (!integer_onep (info->assumptions) && !main_loop_info)
1910 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1912 for (gcond *cond : info->conds)
1914 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1915 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1916 /* Mark the statement as a condition. */
1917 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1920 for (unsigned i = 1; i < info->conds.length (); i ++)
1921 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1922 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1924 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1926 /* Check to see if we're vectorizing multiple exits. */
1927 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1928 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1930 if (info->inner_loop_cond)
1932 stmt_vec_info inner_loop_cond_info
1933 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1934 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1935 /* If we have an estimate on the number of iterations of the inner
1936 loop, use that to limit the scale for costing; otherwise use
1937 --param vect-inner-loop-cost-factor literally. */
1938 widest_int nit;
1939 if (estimated_stmt_executions (loop->inner, &nit))
1940 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1941 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1944 return loop_vinfo;
1949 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1950 statements, update the vectorization factor. */
1952 static void
1953 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1955 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1956 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1957 int nbbs = loop->num_nodes;
1958 poly_uint64 vectorization_factor;
1959 int i;
1961 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1963 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1964 gcc_assert (known_ne (vectorization_factor, 0U));
1966 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1967 the vectorization factor of the loop is the unrolling factor required by
1968 the SLP instances. If that unrolling factor is 1, we say that we
1969 perform pure SLP on the loop - cross-iteration parallelism is not
1970 exploited. */
1971 bool only_slp_in_loop = true;
1972 for (i = 0; i < nbbs; i++)
1974 basic_block bb = bbs[i];
1975 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1976 gsi_next (&si))
1978 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1979 if (!stmt_info)
1980 continue;
1981 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983 && !PURE_SLP_STMT (stmt_info))
1984 /* STMT needs both SLP and loop-based vectorization. */
1985 only_slp_in_loop = false;
1987 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1988 gsi_next (&si))
1990 if (is_gimple_debug (gsi_stmt (si)))
1991 continue;
1992 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1993 stmt_info = vect_stmt_to_vectorize (stmt_info);
1994 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1995 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1996 && !PURE_SLP_STMT (stmt_info))
1997 /* STMT needs both SLP and loop-based vectorization. */
1998 only_slp_in_loop = false;
2002 if (only_slp_in_loop)
2004 if (dump_enabled_p ())
2005 dump_printf_loc (MSG_NOTE, vect_location,
2006 "Loop contains only SLP stmts\n");
2007 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2009 else
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Loop contains SLP and non-SLP stmts\n");
2014 /* Both the vectorization factor and unroll factor have the form
2015 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2016 so they must have a common multiple. */
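/* As a purely arithmetic illustration, factors of 4 and 6 would be
   combined to a common multiple of 12.  */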
2017 vectorization_factor
2018 = force_common_multiple (vectorization_factor,
2019 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2022 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2023 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_NOTE, vect_location,
2026 "Updating vectorization factor to ");
2027 dump_dec (MSG_NOTE, vectorization_factor);
2028 dump_printf (MSG_NOTE, ".\n");
2032 /* Return true if STMT_INFO describes a double reduction phi and if
2033 the other phi in the reduction is also relevant for vectorization.
2034 This rejects cases such as:
2036 outer1:
2037 x_1 = PHI <x_3(outer2), ...>;
2040 inner:
2041 x_2 = ...;
2044 outer2:
2045 x_3 = PHI <x_2(inner)>;
2047 if nothing in x_2 or elsewhere makes x_1 relevant. */
2049 static bool
2050 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2052 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2053 return false;
2055 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2058 /* Function vect_analyze_loop_operations.
2060 Scan the loop stmts and make sure they are all vectorizable. */
2062 static opt_result
2063 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2065 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2066 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2067 int nbbs = loop->num_nodes;
2068 int i;
2069 stmt_vec_info stmt_info;
2070 bool need_to_vectorize = false;
2071 bool ok;
2073 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2075 auto_vec<stmt_info_for_cost> cost_vec;
2077 for (i = 0; i < nbbs; i++)
2079 basic_block bb = bbs[i];
2081 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2082 gsi_next (&si))
2084 gphi *phi = si.phi ();
2085 ok = true;
2087 stmt_info = loop_vinfo->lookup_stmt (phi);
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2090 (gimple *) phi);
2091 if (virtual_operand_p (gimple_phi_result (phi)))
2092 continue;
2094 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2095 (i.e., a phi in the tail of the outer-loop). */
2096 if (! is_loop_header_bb_p (bb))
2098 /* FORNOW: we currently don't support the case that these phis
2099 are not used in the outerloop (unless it is a double reduction,
2100 i.e., this phi is vect_reduction_def), because this case
2101 requires us to actually do something here. */
2102 if (STMT_VINFO_LIVE_P (stmt_info)
2103 && !vect_active_double_reduction_p (stmt_info))
2104 return opt_result::failure_at (phi,
2105 "Unsupported loop-closed phi"
2106 " in outer-loop.\n");
2108 /* If PHI is used in the outer loop, we check that its operand
2109 is defined in the inner loop. */
2110 if (STMT_VINFO_RELEVANT_P (stmt_info))
2112 tree phi_op;
2114 if (gimple_phi_num_args (phi) != 1)
2115 return opt_result::failure_at (phi, "unsupported phi");
2117 phi_op = PHI_ARG_DEF (phi, 0);
2118 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2119 if (!op_def_info)
2120 return opt_result::failure_at (phi, "unsupported phi\n");
2122 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2123 && (STMT_VINFO_RELEVANT (op_def_info)
2124 != vect_used_in_outer_by_reduction))
2125 return opt_result::failure_at (phi, "unsupported phi\n");
2127 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2128 || (STMT_VINFO_DEF_TYPE (stmt_info)
2129 == vect_double_reduction_def))
2130 && !vectorizable_lc_phi (loop_vinfo,
2131 stmt_info, NULL, NULL))
2132 return opt_result::failure_at (phi, "unsupported phi\n");
2135 continue;
2138 gcc_assert (stmt_info);
2140 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2141 || STMT_VINFO_LIVE_P (stmt_info))
2142 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2143 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2144 /* A scalar-dependence cycle that we don't support. */
2145 return opt_result::failure_at (phi,
2146 "not vectorized:"
2147 " scalar dependence cycle.\n");
2149 if (STMT_VINFO_RELEVANT_P (stmt_info))
2151 need_to_vectorize = true;
2152 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2153 && ! PURE_SLP_STMT (stmt_info))
2154 ok = vectorizable_induction (loop_vinfo,
2155 stmt_info, NULL, NULL,
2156 &cost_vec);
2157 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2158 || (STMT_VINFO_DEF_TYPE (stmt_info)
2159 == vect_double_reduction_def)
2160 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2161 && ! PURE_SLP_STMT (stmt_info))
2162 ok = vectorizable_reduction (loop_vinfo,
2163 stmt_info, NULL, NULL, &cost_vec);
2164 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2165 == vect_first_order_recurrence)
2166 && ! PURE_SLP_STMT (stmt_info))
2167 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2168 &cost_vec);
2171 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2172 if (ok
2173 && STMT_VINFO_LIVE_P (stmt_info)
2174 && !PURE_SLP_STMT (stmt_info))
2175 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2176 -1, false, &cost_vec);
2178 if (!ok)
2179 return opt_result::failure_at (phi,
2180 "not vectorized: relevant phi not "
2181 "supported: %G",
2182 static_cast <gimple *> (phi));
2185 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2186 gsi_next (&si))
2188 gimple *stmt = gsi_stmt (si);
2189 if (!gimple_clobber_p (stmt)
2190 && !is_gimple_debug (stmt))
2192 opt_result res
2193 = vect_analyze_stmt (loop_vinfo,
2194 loop_vinfo->lookup_stmt (stmt),
2195 &need_to_vectorize,
2196 NULL, NULL, &cost_vec);
2197 if (!res)
2198 return res;
2201 } /* bbs */
2203 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2205 /* All operations in the loop are either irrelevant (they deal with loop
2206 control, or are dead), or are only used outside the loop and can be moved
2207 out of the loop (e.g. invariants, inductions). The loop can be
2208 optimized away by scalar optimizations. We're better off not
2209 touching this loop. */
2210 if (!need_to_vectorize)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "All the computation can be taken out of the loop.\n");
2215 return opt_result::failure_at
2216 (vect_location,
2217 "not vectorized: redundant loop. no profit to vectorize.\n");
2220 return opt_result::success ();
2223 /* Return true if we know that the iteration count is smaller than the
2224 vectorization factor. Return false if it isn't, or if we can't be sure
2225 either way. */
2227 static bool
2228 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2230 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2232 HOST_WIDE_INT max_niter;
2233 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2234 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2235 else
2236 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2238 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2239 return true;
2241 return false;
2244 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2245 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2246 definitely no, or -1 if it's worth retrying. */
2248 static int
2249 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2250 unsigned *suggested_unroll_factor)
2252 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2253 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2255 /* Only loops that can handle partially-populated vectors can have iteration
2256 counts less than the vectorization factor. */
2257 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2258 && vect_known_niters_smaller_than_vf (loop_vinfo))
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "not vectorized: iteration count smaller than "
2263 "vectorization factor.\n");
2264 return 0;
2267 /* If we know the number of iterations we can do better: for the
2268 epilogue we can also decide whether the main loop leaves us
2269 with enough iterations, preferring a smaller vector epilogue that is
2270 then also possibly used for the case in which we skip the vector loop. */
2271 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2273 widest_int scalar_niters
2274 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2275 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2277 loop_vec_info orig_loop_vinfo
2278 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2279 unsigned lowest_vf
2280 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2281 int prolog_peeling = 0;
2282 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2283 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2284 if (prolog_peeling >= 0
2285 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2286 lowest_vf))
2288 unsigned gap
2289 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2290 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2291 % lowest_vf + gap);
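/* I.e. the epilogue handles the scalar iterations that remain after
   prologue peeling and the main loop's full vector iterations, plus
   the single iteration reserved for gaps, if any.  */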
2294 /* Reject vectorizing for a single scalar iteration, even if
2295 we could in principle implement that using partial vectors. */
2296 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2297 if (scalar_niters <= peeling_gap + 1)
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "not vectorized: loop only has a single "
2302 "scalar iteration.\n");
2303 return 0;
2306 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2308 /* Check that the loop processes at least one full vector. */
2309 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2310 if (known_lt (scalar_niters, vf))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "loop does not have enough iterations "
2315 "to support vectorization.\n");
2316 return 0;
2319 /* If we need to peel an extra epilogue iteration to handle data
2320 accesses with gaps, check that there are enough scalar iterations
2321 available.
2323 The check above is redundant with this one when peeling for gaps,
2324 but the distinction is useful for diagnostics. */
2325 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326 && known_le (scalar_niters, vf))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "loop does not have enough iterations "
2331 "to support peeling for gaps.\n");
2332 return 0;
2337 /* If using the "very cheap" model, reject cases in which we'd keep
2338 a copy of the scalar code (even if we might be able to vectorize it). */
2339 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2340 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2341 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2342 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "some scalar iterations would need to be peeled\n");
2347 return 0;
2350 int min_profitable_iters, min_profitable_estimate;
2351 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2352 &min_profitable_estimate,
2353 suggested_unroll_factor);
2355 if (min_profitable_iters < 0)
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "not vectorized: vectorization not profitable.\n");
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362 "not vectorized: vector version will never be "
2363 "profitable.\n");
2364 return -1;
2367 int min_scalar_loop_bound = (param_min_vect_loop_bound
2368 * assumed_vf);
2370 /* Use the cost model only if it is more conservative than the
2371 user-specified threshold. */
2372 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2373 min_profitable_iters);
2375 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
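/* Illustrative example (not a statement about defaults): with
   --param min-vect-loop-bound=2 and an assumed VF of 4, loops known
   to run fewer than MAX (8, min_profitable_iters) times are rejected
   by the check below.  */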
2377 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2378 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2380 if (dump_enabled_p ())
2381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2382 "not vectorized: vectorization not profitable.\n");
2383 if (dump_enabled_p ())
2384 dump_printf_loc (MSG_NOTE, vect_location,
2385 "not vectorized: iteration count smaller than user "
2386 "specified loop bound parameter or minimum profitable "
2387 "iterations (whichever is more conservative).\n");
2388 return 0;
2391 /* The static profitability threshold min_profitable_estimate includes
2392 the cost of having to check at runtime whether the scalar loop
2393 should be used instead. If it turns out that we don't need or want
2394 such a check, the threshold we should use for the static estimate
2395 is simply the point at which the vector loop becomes more profitable
2396 than the scalar loop. */
2397 if (min_profitable_estimate > min_profitable_iters
2398 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2399 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2400 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2401 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2405 " choice between the scalar and vector loops\n");
2406 min_profitable_estimate = min_profitable_iters;
2409 /* If the vector loop needs multiple iterations to be beneficial then
2410 things are probably too close to call, and the conservative thing
2411 would be to stick with the scalar code. */
2412 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2413 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2415 if (dump_enabled_p ())
2416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2417 "one iteration of the vector loop would be"
2418 " more expensive than the equivalent number of"
2419 " iterations of the scalar loop\n");
2420 return 0;
2423 HOST_WIDE_INT estimated_niter;
2425 /* If we are vectorizing an epilogue then we know the maximum number of
2426 scalar iterations it will cover is at least one lower than the
2427 vectorization factor of the main loop. */
2428 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2429 estimated_niter
2430 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2431 else
2433 estimated_niter = estimated_stmt_executions_int (loop);
2434 if (estimated_niter == -1)
2435 estimated_niter = likely_max_stmt_executions_int (loop);
2437 if (estimated_niter != -1
2438 && ((unsigned HOST_WIDE_INT) estimated_niter
2439 < MAX (th, (unsigned) min_profitable_estimate)))
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2443 "not vectorized: estimated iteration count too "
2444 "small.\n");
2445 if (dump_enabled_p ())
2446 dump_printf_loc (MSG_NOTE, vect_location,
2447 "not vectorized: estimated iteration count smaller "
2448 "than specified loop bound parameter or minimum "
2449 "profitable iterations (whichever is more "
2450 "conservative).\n");
2451 return -1;
2454 return 1;
2457 static opt_result
2458 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2459 vec<data_reference_p> *datarefs,
2460 unsigned int *n_stmts)
2462 *n_stmts = 0;
2463 for (unsigned i = 0; i < loop->num_nodes; i++)
2464 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2465 !gsi_end_p (gsi); gsi_next (&gsi))
2467 gimple *stmt = gsi_stmt (gsi);
2468 if (is_gimple_debug (stmt))
2469 continue;
2470 ++(*n_stmts);
2471 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2472 NULL, 0);
2473 if (!res)
2475 if (is_gimple_call (stmt) && loop->safelen)
2477 tree fndecl = gimple_call_fndecl (stmt), op;
2478 if (fndecl == NULL_TREE
2479 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2481 fndecl = gimple_call_arg (stmt, 0);
2482 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2483 fndecl = TREE_OPERAND (fndecl, 0);
2484 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2486 if (fndecl != NULL_TREE)
2488 cgraph_node *node = cgraph_node::get (fndecl);
2489 if (node != NULL && node->simd_clones != NULL)
2491 unsigned int j, n = gimple_call_num_args (stmt);
2492 for (j = 0; j < n; j++)
2494 op = gimple_call_arg (stmt, j);
2495 if (DECL_P (op)
2496 || (REFERENCE_CLASS_P (op)
2497 && get_base_address (op)))
2498 break;
2500 op = gimple_call_lhs (stmt);
2501 /* Ignore #pragma omp declare simd functions
2502 if they don't have data references in the
2503 call stmt itself. */
2504 if (j == n
2505 && !(op
2506 && (DECL_P (op)
2507 || (REFERENCE_CLASS_P (op)
2508 && get_base_address (op)))))
2509 continue;
2513 return res;
2515 /* If dependence analysis will give up due to the limit on the
2516 number of datarefs, stop here and fail fatally. */
2517 if (datarefs->length ()
2518 > (unsigned)param_loop_max_datarefs_for_datadeps)
2519 return opt_result::failure_at (stmt, "exceeded param "
2520 "loop-max-datarefs-for-datadeps\n");
2522 return opt_result::success ();
2525 /* Look for SLP-only access groups and turn each individual access into its own
2526 group. */
2527 static void
2528 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2530 unsigned int i;
2531 struct data_reference *dr;
2533 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2535 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2536 FOR_EACH_VEC_ELT (datarefs, i, dr)
2538 gcc_assert (DR_REF (dr));
2539 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2541 /* Check if the access is part of an interleaving chain. */
2542 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2544 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2545 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2546 unsigned int group_size = DR_GROUP_SIZE (first_element);
2548 /* Check if this is an SLP-only group. */
2549 if (!STMT_SLP_TYPE (stmt_info)
2550 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2552 /* Dissolve the group. */
2553 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2555 stmt_vec_info vinfo = first_element;
2556 while (vinfo)
2558 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2559 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2560 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2561 DR_GROUP_SIZE (vinfo) = 1;
2562 if (STMT_VINFO_STRIDED_P (first_element)
2563 /* We cannot handle stores with gaps. */
2564 || DR_IS_WRITE (dr_info->dr))
2566 STMT_VINFO_STRIDED_P (vinfo) = true;
2567 DR_GROUP_GAP (vinfo) = 0;
2569 else
2570 DR_GROUP_GAP (vinfo) = group_size - 1;
2571 /* Duplicate and adjust the alignment info; it needs to
2572 be present on each group leader, see dr_misalignment. */
2573 if (vinfo != first_element)
2575 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2576 dr_info2->target_alignment = dr_info->target_alignment;
2577 int misalignment = dr_info->misalignment;
2578 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2580 HOST_WIDE_INT diff
2581 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2582 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2583 unsigned HOST_WIDE_INT align_c
2584 = dr_info->target_alignment.to_constant ();
2585 misalignment = (misalignment + diff) % align_c;
2587 dr_info2->misalignment = misalignment;
2589 vinfo = next;
2596 /* Determine if operating on full vectors for LOOP_VINFO might leave
2597 some scalar iterations still to do. If so, decide how we should
2598 handle those scalar iterations. The possibilities are:
2600 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2601 In this case:
2603 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2604 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2605 LOOP_VINFO_PEELING_FOR_NITER == false
2607 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2608 to handle the remaining scalar iterations. In this case:
2610 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2611 LOOP_VINFO_PEELING_FOR_NITER == true
2613 There are two choices:
2615 (2a) Consider vectorizing the epilogue loop at the same VF as the
2616 main loop, but using partial vectors instead of full vectors.
2617 In this case:
2619 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2621 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2622 In this case:
2624 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2627 opt_result
2628 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2630 /* Determine whether there would be any scalar iterations left over. */
2631 bool need_peeling_or_partial_vectors_p
2632 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2634 /* Decide whether to vectorize the loop with partial vectors. */
2635 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2636 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2637 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2638 && need_peeling_or_partial_vectors_p)
2640 /* For partial-vector-usage=1, try to push the handling of partial
2641 vectors to the epilogue, with the main loop continuing to operate
2642 on full vectors.
2644 If we are unrolling we also do not want to use partial vectors. This
2645 is to avoid the overhead of generating multiple masks and also to
2646 avoid having to execute entire iterations of FALSE masked instructions
2647 when dealing with one or fewer full iterations.
2649 ??? We could then end up failing to use partial vectors if we
2650 decide to peel iterations into a prologue, and if the main loop
2651 then ends up processing fewer than VF iterations. */
2652 if ((param_vect_partial_vector_usage == 1
2653 || loop_vinfo->suggested_unroll_factor > 1)
2654 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2655 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2656 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2657 else
2658 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2661 if (dump_enabled_p ())
2662 dump_printf_loc (MSG_NOTE, vect_location,
2663 "operating on %s vectors%s.\n",
2664 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2665 ? "partial" : "full",
2666 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2667 ? " for epilogue loop" : "");
2669 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2670 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2671 && need_peeling_or_partial_vectors_p);
2673 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2674 analysis, when we don't yet know whether the loop will be vectorized
2675 using partial vectors (for details see tree-vect-loop-manip.cc).
2677 However, the SELECT_VL vectorization style should only be applied to
2678 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2679 the number of elements to be processed in each iteration.
2681 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2682 if the loop is not vectorized with partial vectors. */
2683 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2684 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2686 return opt_result::success ();
2689 /* Function vect_analyze_loop_2.
2691 Apply a set of analyses on the LOOP specified by LOOP_VINFO; the different
2692 analyses will record information in some members of LOOP_VINFO. FATAL
2693 indicates whether some analysis hit a fatal error. If a non-NULL pointer
2694 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with a
2695 worked-out suggested unroll factor, while a NULL pointer indicates we are
2696 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2697 holds the SLP decision made when the suggested unroll factor was worked
2698 out. */
2699 static opt_result
2700 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2701 unsigned *suggested_unroll_factor,
2702 bool& slp_done_for_suggested_uf)
2704 opt_result ok = opt_result::success ();
2705 int res;
2706 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2707 poly_uint64 min_vf = 2;
2708 loop_vec_info orig_loop_vinfo = NULL;
2710 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2711 loop_vec_info of the first vectorized loop. */
2712 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2713 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2714 else
2715 orig_loop_vinfo = loop_vinfo;
2716 gcc_assert (orig_loop_vinfo);
2718 /* The first group of checks is independent of the vector size. */
2719 fatal = true;
2721 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2722 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2723 return opt_result::failure_at (vect_location,
2724 "not vectorized: simd if(0)\n");
2726 /* Find all data references in the loop (which correspond to vdefs/vuses)
2727 and analyze their evolution in the loop. */
2729 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2731 /* Gather the data references and count stmts in the loop. */
2732 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2734 opt_result res
2735 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2736 &LOOP_VINFO_DATAREFS (loop_vinfo),
2737 &LOOP_VINFO_N_STMTS (loop_vinfo));
2738 if (!res)
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742 "not vectorized: loop contains function "
2743 "calls or data references that cannot "
2744 "be analyzed\n");
2745 return res;
2747 loop_vinfo->shared->save_datarefs ();
2749 else
2750 loop_vinfo->shared->check_datarefs ();
2752 /* Analyze the data references and also adjust the minimal
2753 vectorization factor according to the loads and stores. */
2755 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2756 if (!ok)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data references.\n");
2761 return ok;
2764 /* Check if we are applying unroll factor now. */
2765 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2766 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
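/* I.e. when applying a previously suggested unroll factor the caller
   must not also be asking for a new suggestion (SUGGESTED_UNROLL_FACTOR
   is then NULL).  */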
2768 /* If the SLP decision was false when the suggested unroll factor was
2769 worked out, and we are applying that suggested unroll factor, we can
2770 simply skip all SLP-related analyses this time. */
2771 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2773 /* Classify all cross-iteration scalar data-flow cycles.
2774 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2775 vect_analyze_scalar_cycles (loop_vinfo, slp);
2777 vect_pattern_recog (loop_vinfo);
2779 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2781 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2782 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2784 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2785 if (!ok)
2787 if (dump_enabled_p ())
2788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2789 "bad data access.\n");
2790 return ok;
2793 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2795 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2796 if (!ok)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "unexpected pattern.\n");
2801 return ok;
2804 /* Whereas the rest of the analysis below depends on the vector size in some way. */
2805 fatal = false;
2807 /* Analyze data dependences between the data-refs in the loop
2808 and adjust the maximum vectorization factor according to
2809 the dependences.
2810 FORNOW: fail at the first data dependence that we encounter. */
2812 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "bad data dependence.\n");
2818 return ok;
2820 if (max_vf != MAX_VECTORIZATION_FACTOR
2821 && maybe_lt (max_vf, min_vf))
2822 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2823 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2825 ok = vect_determine_vectorization_factor (loop_vinfo);
2826 if (!ok)
2828 if (dump_enabled_p ())
2829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2830 "can't determine vectorization factor.\n");
2831 return ok;
2834 /* Compute the scalar iteration cost. */
2835 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2837 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2839 if (slp)
2841 /* Check the SLP opportunities in the loop, analyze and build
2842 SLP trees. */
2843 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2844 if (!ok)
2845 return ok;
2847 /* If there are any SLP instances mark them as pure_slp. */
2848 slp = vect_make_slp_decision (loop_vinfo);
2849 if (slp)
2851 /* Find stmts that need to be both vectorized and SLPed. */
2852 vect_detect_hybrid_slp (loop_vinfo);
2854 /* Update the vectorization factor based on the SLP decision. */
2855 vect_update_vf_for_slp (loop_vinfo);
2857 /* Optimize the SLP graph with the vectorization factor fixed. */
2858 vect_optimize_slp (loop_vinfo);
2860 /* Gather the loads reachable from the SLP graph entries. */
2861 vect_gather_slp_loads (loop_vinfo);
2865 bool saved_can_use_partial_vectors_p
2866 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2868 /* We don't expect to have to roll back to anything other than an empty
2869 set of rgroups. */
2870 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2872 /* This is the point where we can re-start analysis with SLP forced off. */
2873 start_over:
2875 /* Apply the suggested unrolling factor; this was determined by the backend
2876 during finish_cost the first time we ran the analysis for this
2877 vector mode. */
2878 if (applying_suggested_uf)
2879 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2881 /* Now the vectorization factor is final. */
2882 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2883 gcc_assert (known_ne (vectorization_factor, 0U));
2885 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2887 dump_printf_loc (MSG_NOTE, vect_location,
2888 "vectorization_factor = ");
2889 dump_dec (MSG_NOTE, vectorization_factor);
2890 dump_printf (MSG_NOTE, ", niters = %wd\n",
2891 LOOP_VINFO_INT_NITERS (loop_vinfo));
2894 if (max_vf != MAX_VECTORIZATION_FACTOR
2895 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2896 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2898 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2900 /* Analyze the alignment of the data-refs in the loop.
2901 Fail if a data reference is found that cannot be vectorized. */
2903 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2904 if (!ok)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "bad data alignment.\n");
2909 return ok;
2912 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2913 It is important to call pruning after vect_analyze_data_ref_accesses,
2914 since we use grouping information gathered by interleaving analysis. */
2915 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2916 if (!ok)
2917 return ok;
2919 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2920 vectorization, since we do not want to add extra peeling or
2921 add versioning for alignment. */
2922 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2923 /* This pass will decide on using loop versioning and/or loop peeling in
2924 order to enhance the alignment of data references in the loop. */
2925 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2926 if (!ok)
2927 return ok;
2929 if (slp)
2931 /* Analyze operations in the SLP instances. Note this may
2932 remove unsupported SLP instances which makes the above
2933 SLP kind detection invalid. */
2934 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2935 vect_slp_analyze_operations (loop_vinfo);
2936 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2938 ok = opt_result::failure_at (vect_location,
2939 "unsupported SLP instances\n");
2940 goto again;
2943 /* Check whether any load in ALL SLP instances is possibly permuted. */
2944 slp_tree load_node, slp_root;
2945 unsigned i, x;
2946 slp_instance instance;
2947 bool can_use_lanes = true;
2948 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2950 slp_root = SLP_INSTANCE_TREE (instance);
2951 int group_size = SLP_TREE_LANES (slp_root);
2952 tree vectype = SLP_TREE_VECTYPE (slp_root);
2953 bool loads_permuted = false;
2954 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2956 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2957 continue;
2958 unsigned j;
2959 stmt_vec_info load_info;
2960 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2961 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2963 loads_permuted = true;
2964 break;
2968 /* If the loads and stores can be handled with load/store-lane
2969 instructions, record it and move on to the next instance. */
2970 if (loads_permuted
2971 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2972 && vect_store_lanes_supported (vectype, group_size, false)
2973 != IFN_LAST)
2975 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2976 if (STMT_VINFO_GROUPED_ACCESS
2977 (SLP_TREE_REPRESENTATIVE (load_node)))
2979 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2980 (SLP_TREE_REPRESENTATIVE (load_node));
2981 /* Use SLP for strided accesses (or if we can't use
2982 load-lanes). */
2983 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2984 || vect_load_lanes_supported
2985 (STMT_VINFO_VECTYPE (stmt_vinfo),
2986 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2987 break;
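/* The walk above only reaches the end (i equal to the number of loads)
   if it did not break out, i.e. if every grouped load in this instance
   can use load-lanes.  */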
2990 can_use_lanes
2991 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2993 if (can_use_lanes && dump_enabled_p ())
2994 dump_printf_loc (MSG_NOTE, vect_location,
2995 "SLP instance %p can use load/store-lanes\n",
2996 (void *) instance);
2998 else
3000 can_use_lanes = false;
3001 break;
3005 /* If all SLP instances can use load/store-lanes, abort SLP and try again
3006 with SLP disabled. */
3007 if (can_use_lanes)
3009 ok = opt_result::failure_at (vect_location,
3010 "Built SLP cancelled: can use "
3011 "load/store-lanes\n");
3012 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "Built SLP cancelled: all SLP instances support "
3015 "load/store-lanes\n");
3016 goto again;
3020 /* Dissolve SLP-only groups. */
3021 vect_dissolve_slp_only_groups (loop_vinfo);
3023 /* Scan all the remaining operations in the loop that are not subject
3024 to SLP and make sure they are vectorizable. */
3025 ok = vect_analyze_loop_operations (loop_vinfo);
3026 if (!ok)
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 "bad operation or unsupported loop bound.\n");
3031 return ok;
3034 /* For now, we don't expect to mix both masking and length approaches for
3035 one loop; disable the use of partial vectors if both are recorded. */
3036 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3037 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3038 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3042 "can't vectorize a loop with partial vectors"
3043 " because we don't expect to mix different"
3044 " approaches with partial vectors for the"
3045 " same loop.\n");
3046 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3049 /* If we still have the option of using partial vectors,
3050 check whether we can generate the necessary loop controls. */
3051 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3053 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3055 if (!vect_verify_full_masking (loop_vinfo)
3056 && !vect_verify_full_masking_avx512 (loop_vinfo))
3057 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3059 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3060 if (!vect_verify_loop_lens (loop_vinfo))
3061 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3064 /* If we're vectorizing a loop that uses length "controls" and
3065 can iterate more than once, we apply the decrementing IV approach
3066 to the loop control. */
3067 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3068 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3069 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3070 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3071 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3072 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3073 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3075 /* If a loop uses length controls and has a decrementing loop control IV,
3076 we will normally pass that IV through a MIN_EXPR to calculate the
3077 basis for the length controls. E.g. in a loop that processes one
3078 element per scalar iteration, the number of elements would be
3079 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3081 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3082 step, since only the final iteration of the vector loop can have
3083 inactive lanes.
3085 However, some targets have a dedicated instruction for calculating the
3086 preferred length, given the total number of elements that still need to
3087 be processed. This is encapsulated in the SELECT_VL internal function.
3089 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3090 to determine the basis for the length controls. However, unlike the
3091 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3092 lanes inactive in any iteration of the vector loop, not just the last
3093 iteration. This SELECT_VL approach therefore requires us to use pointer
3094 IVs with variable steps.
3096 Once we've decided how many elements should be processed by one
3097 iteration of the vector loop, we need to populate the rgroup controls.
3098 If a loop has multiple rgroups, we need to make sure that those rgroups
3099 "line up" (that is, they must be consistent about which elements are
3100 active and which aren't). This is done by vect_adjust_loop_lens_control.
3102 In principle, it would be possible to use vect_adjust_loop_lens_control
3103 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3104 However:
3106 (1) In practice, it only makes sense to use SELECT_VL when a vector
3107 operation will be controlled directly by the result. It is not
3108 worth using SELECT_VL if it would only be the input to other
3109 calculations.
3111 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3112 pointer IV will need N updates by a variable amount (N-1 updates
3113 within the iteration and 1 update to move to the next iteration).
3115 Because of this, we prefer to use the MIN_EXPR approach whenever there
3116 is more than one length control.
3118 In addition, SELECT_VL always operates to a granularity of 1 unit.
3119 If we wanted to use it to control an SLP operation on N consecutive
3120 elements, we would need to make the SELECT_VL inputs measure scalar
3121 iterations (rather than elements) and then multiply the SELECT_VL
3122 result by N. But using SELECT_VL this way is inefficient because
3123 of (1) above.
3125 Finally, we don't apply SELECT_VL on a single rgroup when both of the
3126 following are satisfied:
3128 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3129 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3131 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3132 we would lose the benefit of subsequent unrolling optimizations, so we
3133 prefer using the MIN_EXPR approach in this situation. */
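/* A rough sketch of the two styles (not the literal IR we generate):
   with MIN_EXPR the per-iteration count is len = MIN_EXPR <remaining, VF>,
   so only the final vector iteration can have len < VF; with
   len = SELECT_VL (remaining, VF) the target may return fewer than VF
   elements in any iteration, which is why pointer IVs must then advance
   by the variable len.  */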
3134 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3136 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3137 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3138 OPTIMIZE_FOR_SPEED)
3139 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3140 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3141 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3142 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3143 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3146 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3147 assuming that the loop will be used as a main loop. We will redo
3148 this analysis later if we instead decide to use the loop as an
3149 epilogue loop. */
3150 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3151 if (!ok)
3152 return ok;
3154 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3155 to be able to handle fewer than VF scalars, or needs to have a lower VF
3156 than the main loop. */
3157 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3158 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3160 poly_uint64 unscaled_vf
3161 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3162 orig_loop_vinfo->suggested_unroll_factor);
3163 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3164 return opt_result::failure_at (vect_location,
3165 "Vectorization factor too high for"
3166 " epilogue loop.\n");
3169 /* Check the costings of the loop make vectorizing worthwhile. */
3170 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3171 if (res < 0)
3173 ok = opt_result::failure_at (vect_location,
3174 "Loop costings may not be worthwhile.\n");
3175 goto again;
3177 if (!res)
3178 return opt_result::failure_at (vect_location,
3179 "Loop costings not worthwhile.\n");
3181 /* If an epilogue loop is required make sure we can create one. */
3182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3183 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3184 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3186 if (dump_enabled_p ())
3187 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3188 if (!vect_can_advance_ivs_p (loop_vinfo)
3189 || !slpeel_can_duplicate_loop_p (loop,
3190 LOOP_VINFO_IV_EXIT (loop_vinfo),
3191 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3193 ok = opt_result::failure_at (vect_location,
3194 "not vectorized: can't create required "
3195 "epilog loop\n");
3196 goto again;
3200 /* During peeling, we need to check whether the number of loop iterations is
3201 enough for both the peeled prolog loop and the vector loop. This check
3202 can be merged with the threshold check of loop versioning, so
3203 increase the threshold for this case if necessary.
3205 If we are analyzing an epilogue we still want to check what its
3206 versioning threshold would be. If we decide to vectorize the epilogues we
3207 will want to use the lowest versioning threshold of all epilogues and main
3208 loop. This will enable us to enter a vectorized epilogue even when
3209 versioning the loop. We can't simply check whether the epilogue requires
3210 versioning though since we may have skipped some versioning checks when
3211 analyzing the epilogue. For instance, checks for alias versioning will be
3212 skipped when dealing with epilogues as we assume we already checked them
3213 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3214 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3216 poly_uint64 niters_th = 0;
3217 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3219 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3221 /* Niters for peeled prolog loop. */
3222 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
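/* A negative value means the exact peel count is unknown at compile
   time, so assume the worst case of nunits - 1 prologue iterations.  */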
3224 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3225 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3226 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3228 else
3229 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3232 /* Niters for at least one iteration of vectorized loop. */
3233 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3234 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3235 /* One additional iteration because of peeling for gap. */
3236 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3237 niters_th += 1;
3239 /* Use the same condition as vect_transform_loop to decide when to use
3240 the cost to determine a versioning threshold. */
3241 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3242 && ordered_p (th, niters_th))
3243 niters_th = ordered_max (poly_uint64 (th), niters_th);
3245 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3248 gcc_assert (known_eq (vectorization_factor,
3249 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3251 slp_done_for_suggested_uf = slp;
3253 /* Ok to vectorize! */
3254 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3255 return opt_result::success ();
3257 again:
3258 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3259 gcc_assert (!ok);
3261 /* Try again with SLP forced off, but if we didn't do any SLP there is
3262 no point in re-trying. */
3263 if (!slp)
3264 return ok;
3266 /* If the SLP decision was true when the suggested unroll factor was
3267 worked out, and we are applying that suggested unroll factor, we don't
3268 need to re-try any more. */
3269 if (applying_suggested_uf && slp_done_for_suggested_uf)
3270 return ok;
3272 /* If there are reduction chains re-trying will fail anyway. */
3273 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3274 return ok;
3276 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3277 via interleaving or lane instructions. */
3278 slp_instance instance;
3279 slp_tree node;
3280 unsigned i, j;
3281 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3283 stmt_vec_info vinfo;
3284 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3285 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3286 continue;
3287 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3288 unsigned int size = DR_GROUP_SIZE (vinfo);
3289 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3290 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3291 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3292 && ! vect_grouped_store_supported (vectype, size))
3293 return opt_result::failure_at (vinfo->stmt,
3294 "unsupported grouped store\n");
3295 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3297 vinfo = SLP_TREE_REPRESENTATIVE (node);
3298 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3300 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3301 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3302 size = DR_GROUP_SIZE (vinfo);
3303 vectype = STMT_VINFO_VECTYPE (vinfo);
3304 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3305 && ! vect_grouped_load_supported (vectype, single_element_p,
3306 size))
3307 return opt_result::failure_at (vinfo->stmt,
3308 "unsupported grouped load\n");
3313 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_NOTE, vect_location,
3315 "re-trying with SLP disabled\n");
3317 /* Roll back state appropriately. No SLP this time. */
3318 slp = false;
3320 /* Restore the vectorization factor as it was without SLP. */
3320 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3321 /* Free the SLP instances. */
3322 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3323 vect_free_slp_instance (instance);
3324 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3325 /* Reset SLP type to loop_vect on all stmts. */
3326 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3328 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3329 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3330 !gsi_end_p (si); gsi_next (&si))
3332 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3333 STMT_SLP_TYPE (stmt_info) = loop_vect;
3334 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3335 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3337 /* vectorizable_reduction adjusts reduction stmt def-types;
3338 restore them to that of the PHI. */
3339 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3340 = STMT_VINFO_DEF_TYPE (stmt_info);
3341 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3342 (STMT_VINFO_REDUC_DEF (stmt_info)))
3343 = STMT_VINFO_DEF_TYPE (stmt_info);
3346 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3347 !gsi_end_p (si); gsi_next (&si))
3349 if (is_gimple_debug (gsi_stmt (si)))
3350 continue;
3351 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3352 STMT_SLP_TYPE (stmt_info) = loop_vect;
3353 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3355 stmt_vec_info pattern_stmt_info
3356 = STMT_VINFO_RELATED_STMT (stmt_info);
3357 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3358 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3360 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3361 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3362 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3363 !gsi_end_p (pi); gsi_next (&pi))
3364 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3365 = loop_vect;
3369 /* Free optimized alias test DDRS. */
3370 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3371 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3372 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3373 /* Reset target cost data. */
3374 delete loop_vinfo->vector_costs;
3375 loop_vinfo->vector_costs = nullptr;
3376 /* Reset accumulated rgroup information. */
3377 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3378 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3379 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3380 /* Reset assorted flags. */
3381 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3382 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3383 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3384 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3385 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3386 = saved_can_use_partial_vectors_p;
3388 goto start_over;
3391 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3392 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3393 OLD_LOOP_VINFO is better unless something specifically indicates
3394 otherwise.
3396 Note that this deliberately isn't a partial order. */
3398 static bool
3399 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3400 loop_vec_info old_loop_vinfo)
3402 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3403 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3405 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3406 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3408 /* Always prefer a VF of loop->simdlen over any other VF. */
3409 if (loop->simdlen)
3411 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3412 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3413 if (new_simdlen_p != old_simdlen_p)
3414 return new_simdlen_p;
3417 const auto *old_costs = old_loop_vinfo->vector_costs;
3418 const auto *new_costs = new_loop_vinfo->vector_costs;
3419 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3420 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3422 return new_costs->better_main_loop_than_p (old_costs);
3425 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3426 true if we should. */
3428 static bool
3429 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3430 loop_vec_info old_loop_vinfo)
3432 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3433 return false;
3435 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_NOTE, vect_location,
3437 "***** Preferring vector mode %s to vector mode %s\n",
3438 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3439 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3440 return true;
3443 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3444 not NULL.  Set AUTODETECTED_VECTOR_MODE if the analyzed mode was VOIDmode
3445 and advance MODE_I to the next mode useful to analyze.
3446 Return the loop_vinfo on success and wrapped null on failure. */
3448 static opt_loop_vec_info
3449 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3450 const vect_loop_form_info *loop_form_info,
3451 loop_vec_info main_loop_vinfo,
3452 const vector_modes &vector_modes, unsigned &mode_i,
3453 machine_mode &autodetected_vector_mode,
3454 bool &fatal)
3456 loop_vec_info loop_vinfo
3457 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3459 machine_mode vector_mode = vector_modes[mode_i];
3460 loop_vinfo->vector_mode = vector_mode;
3461 unsigned int suggested_unroll_factor = 1;
3462 bool slp_done_for_suggested_uf = false;
3464 /* Run the main analysis. */
3465 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3466 &suggested_unroll_factor,
3467 slp_done_for_suggested_uf);
3468 if (dump_enabled_p ())
3469 dump_printf_loc (MSG_NOTE, vect_location,
3470 "***** Analysis %s with vector mode %s\n",
3471 res ? "succeeded" : "failed",
3472 GET_MODE_NAME (loop_vinfo->vector_mode));
3474 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_NOTE, vect_location,
3478 "***** Re-trying analysis for unrolling"
3479 " with unroll factor %d and slp %s.\n",
3480 suggested_unroll_factor,
3481 slp_done_for_suggested_uf ? "on" : "off");
3482 loop_vec_info unroll_vinfo
3483 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3484 unroll_vinfo->vector_mode = vector_mode;
3485 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3486 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3487 slp_done_for_suggested_uf);
3488 if (new_res)
3490 delete loop_vinfo;
3491 loop_vinfo = unroll_vinfo;
3493 else
3494 delete unroll_vinfo;
3497 /* Remember the autodetected vector mode. */
3498 if (vector_mode == VOIDmode)
3499 autodetected_vector_mode = loop_vinfo->vector_mode;
3501 /* Advance mode_i, first skipping modes that would result in the
3502 same analysis result. */
3503 while (mode_i + 1 < vector_modes.length ()
3504 && vect_chooses_same_modes_p (loop_vinfo,
3505 vector_modes[mode_i + 1]))
3507 if (dump_enabled_p ())
3508 dump_printf_loc (MSG_NOTE, vect_location,
3509 "***** The result for vector mode %s would"
3510 " be the same\n",
3511 GET_MODE_NAME (vector_modes[mode_i + 1]));
3512 mode_i += 1;
3514 if (mode_i + 1 < vector_modes.length ()
3515 && VECTOR_MODE_P (autodetected_vector_mode)
3516 && (related_vector_mode (vector_modes[mode_i + 1],
3517 GET_MODE_INNER (autodetected_vector_mode))
3518 == autodetected_vector_mode)
3519 && (related_vector_mode (autodetected_vector_mode,
3520 GET_MODE_INNER (vector_modes[mode_i + 1]))
3521 == vector_modes[mode_i + 1]))
3523 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "***** Skipping vector mode %s, which would"
3526 " repeat the analysis for %s\n",
3527 GET_MODE_NAME (vector_modes[mode_i + 1]),
3528 GET_MODE_NAME (autodetected_vector_mode));
3529 mode_i += 1;
3531 mode_i++;
3533 if (!res)
3535 delete loop_vinfo;
3536 if (fatal)
3537 gcc_checking_assert (main_loop_vinfo == NULL);
3538 return opt_loop_vec_info::propagate_failure (res);
3541 return opt_loop_vec_info::success (loop_vinfo);
3544 /* Function vect_analyze_loop.
3546 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3547 for it. The different analyses will record information in the
3548 loop_vec_info struct. */
3549 opt_loop_vec_info
3550 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3552 DUMP_VECT_SCOPE ("analyze_loop_nest");
3554 if (loop_outer (loop)
3555 && loop_vec_info_for_loop (loop_outer (loop))
3556 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3557 return opt_loop_vec_info::failure_at (vect_location,
3558 "outer-loop already vectorized.\n");
3560 if (!find_loop_nest (loop, &shared->loop_nest))
3561 return opt_loop_vec_info::failure_at
3562 (vect_location,
3563 "not vectorized: loop nest containing two or more consecutive inner"
3564 " loops cannot be vectorized\n");
3566 /* Analyze the loop form. */
3567 vect_loop_form_info loop_form_info;
3568 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3569 if (!res)
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3573 "bad loop form.\n");
3574 return opt_loop_vec_info::propagate_failure (res);
3576 if (!integer_onep (loop_form_info.assumptions))
3578 /* We consider vectorizing this loop by versioning it under
3579 some assumptions. In order to do this, we need to clear
3580 existing information computed by scev and niter analyzer. */
3581 scev_reset_htab ();
3582 free_numbers_of_iterations_estimates (loop);
3583 /* Also set a flag for this loop so that the following scev and niter
3584 analyses are done under the assumptions. */
3585 loop_constraint_set (loop, LOOP_C_FINITE);
3587 else
3588 /* Clear the existing niter information to make sure the nonwrapping flag
3589 will be calculated and set appropriately. */
3590 free_numbers_of_iterations_estimates (loop);
3592 auto_vector_modes vector_modes;
3593 /* Autodetect first vector size we try. */
3594 vector_modes.safe_push (VOIDmode);
3595 unsigned int autovec_flags
3596 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3597 loop->simdlen != 0);
3598 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3599 && !unlimited_cost_model (loop));
3600 machine_mode autodetected_vector_mode = VOIDmode;
3601 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3602 unsigned int mode_i = 0;
3603 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3605 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3606 a mode has not been analyzed. */
3607 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3608 for (unsigned i = 0; i < vector_modes.length (); ++i)
3609 cached_vf_per_mode.safe_push (0);
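/* Encoding used for CACHED_VF_PER_MODE below: 0 means the mode has not
   been analyzed yet, -1 means its analysis failed, and any other value
   is the VF that analysis produced, divided by the suggested unroll
   factor.  */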
3611 /* First determine the main loop vectorization mode, either the first
3612 one that works, starting with auto-detecting the vector mode and then
3613 following the target's order of preference, or the one with the
3614 lowest cost if pick_lowest_cost_p. */
3615 while (1)
3617 bool fatal;
3618 unsigned int last_mode_i = mode_i;
3619 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3620 failed. */
3621 cached_vf_per_mode[last_mode_i] = -1;
3622 opt_loop_vec_info loop_vinfo
3623 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3624 NULL, vector_modes, mode_i,
3625 autodetected_vector_mode, fatal);
3626 if (fatal)
3627 break;
3629 if (loop_vinfo)
3631 /* Analysis has been successful so update the VF value.  The
3632 VF should always be a multiple of unroll_factor and we want to
3633 capture the original VF here. */
3634 cached_vf_per_mode[last_mode_i]
3635 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3636 loop_vinfo->suggested_unroll_factor);
3637 /* Once we hit the desired simdlen for the first time,
3638 discard any previous attempts. */
3639 if (simdlen
3640 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3642 delete first_loop_vinfo;
3643 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3644 simdlen = 0;
3646 else if (pick_lowest_cost_p
3647 && first_loop_vinfo
3648 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3650 /* Pick loop_vinfo over first_loop_vinfo. */
3651 delete first_loop_vinfo;
3652 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3654 if (first_loop_vinfo == NULL)
3655 first_loop_vinfo = loop_vinfo;
3656 else
3658 delete loop_vinfo;
3659 loop_vinfo = opt_loop_vec_info::success (NULL);
3662 /* Commit to first_loop_vinfo if we have no reason to try
3663 alternatives. */
3664 if (!simdlen && !pick_lowest_cost_p)
3665 break;
3667 if (mode_i == vector_modes.length ()
3668 || autodetected_vector_mode == VOIDmode)
3669 break;
3671 /* Try the next biggest vector size. */
3672 if (dump_enabled_p ())
3673 dump_printf_loc (MSG_NOTE, vect_location,
3674 "***** Re-trying analysis with vector mode %s\n",
3675 GET_MODE_NAME (vector_modes[mode_i]));
3677 if (!first_loop_vinfo)
3678 return opt_loop_vec_info::propagate_failure (res);
3680 if (dump_enabled_p ())
3681 dump_printf_loc (MSG_NOTE, vect_location,
3682 "***** Choosing vector mode %s\n",
3683 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3685 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3686 enabled, SIMDUID is not set, it is the innermost loop and we have
3687 either already found the loop's SIMDLEN or there was no SIMDLEN to
3688 begin with.
3689 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3690 bool vect_epilogues = (!simdlen
3691 && loop->inner == NULL
3692 && param_vect_epilogues_nomask
3693 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3694 /* No code motion support for multiple epilogues, so for now this is
3695 not supported when the loop has multiple exits. */
3696 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3697 && !loop->simduid);
3698 if (!vect_epilogues)
3699 return first_loop_vinfo;
3701 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3702 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3704 /* For epilogues start the analysis from the first mode. The motivation
3705 behind starting from the beginning comes from cases where the VECTOR_MODES
3706 array may contain length-agnostic and length-specific modes. Their
3707 ordering is not guaranteed, so we could end up picking a mode for the main
3708 loop that is after the epilogue's optimal mode. */
3709 vector_modes[0] = autodetected_vector_mode;
3710 mode_i = 0;
3712 bool supports_partial_vectors =
3713 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3714 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3716 while (1)
3718 /* If the target does not support partial vectors we can shorten the
3719 number of modes to analyze for the epilogue as we know we can't pick a
3720 mode that would lead to a VF at least as big as the
3721 FIRST_VINFO_VF. */
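/* For illustration (made-up numbers): if the main loop was vectorized
   with VF 16 and this mode's cached VF is also 16, a fixed-length
   epilogue using this mode could never run a full vector iteration,
   so the mode is skipped.  */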
3722 if (!supports_partial_vectors
3723 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3725 mode_i++;
3726 if (mode_i == vector_modes.length ())
3727 break;
3728 continue;
3731 if (dump_enabled_p ())
3732 dump_printf_loc (MSG_NOTE, vect_location,
3733 "***** Re-trying epilogue analysis with vector "
3734 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3736 bool fatal;
3737 opt_loop_vec_info loop_vinfo
3738 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3739 first_loop_vinfo,
3740 vector_modes, mode_i,
3741 autodetected_vector_mode, fatal);
3742 if (fatal)
3743 break;
3745 if (loop_vinfo)
3747 if (pick_lowest_cost_p)
3749 /* Keep trying to roll back vectorization attempts while the
3750 loop_vec_infos they produced were worse than this one. */
3751 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3752 while (!vinfos.is_empty ()
3753 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3755 gcc_assert (vect_epilogues);
3756 delete vinfos.pop ();
3759 /* For now only allow one epilogue loop. */
3760 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3762 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3763 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3764 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3765 || maybe_ne (lowest_th, 0U));
3766 /* Keep track of the known smallest versioning
3767 threshold. */
3768 if (ordered_p (lowest_th, th))
3769 lowest_th = ordered_min (lowest_th, th);
3771 else
3773 delete loop_vinfo;
3774 loop_vinfo = opt_loop_vec_info::success (NULL);
3777 /* For now only allow one epilogue loop, but allow
3778 pick_lowest_cost_p to replace it, so commit to the
3779 first epilogue if we have no reason to try alternatives. */
3780 if (!pick_lowest_cost_p)
3781 break;
3784 if (mode_i == vector_modes.length ())
3785 break;
3789 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3791 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3792 if (dump_enabled_p ())
3793 dump_printf_loc (MSG_NOTE, vect_location,
3794 "***** Choosing epilogue vector mode %s\n",
3795 GET_MODE_NAME
3796 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3799 return first_loop_vinfo;
3802 /* Return true if there is an in-order reduction function for CODE, storing
3803 it in *REDUC_FN if so. */
3805 static bool
3806 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3808 /* We support MINUS_EXPR by negating the operand. This also preserves an
3809 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3810 (-0.0) = -0.0. */
3811 if (code == PLUS_EXPR || code == MINUS_EXPR)
3813 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3814 return true;
3816 return false;
3819 /* Function reduction_fn_for_scalar_code
3821 Input:
3822 CODE - the code (tree code or internal function) of a reduction operation.
3824 Output:
3825 REDUC_FN - the corresponding internal function to be used to reduce the
3826 vector of partial results into a single scalar result, or IFN_LAST
3827 if the operation is a supported reduction operation, but does not have
3828 such an internal function.
3830 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3832 bool
3833 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3835 if (code.is_tree_code ())
3836 switch (tree_code (code))
3838 case MAX_EXPR:
3839 *reduc_fn = IFN_REDUC_MAX;
3840 return true;
3842 case MIN_EXPR:
3843 *reduc_fn = IFN_REDUC_MIN;
3844 return true;
3846 case PLUS_EXPR:
3847 *reduc_fn = IFN_REDUC_PLUS;
3848 return true;
3850 case BIT_AND_EXPR:
3851 *reduc_fn = IFN_REDUC_AND;
3852 return true;
3854 case BIT_IOR_EXPR:
3855 *reduc_fn = IFN_REDUC_IOR;
3856 return true;
3858 case BIT_XOR_EXPR:
3859 *reduc_fn = IFN_REDUC_XOR;
3860 return true;
3862 case MULT_EXPR:
3863 case MINUS_EXPR:
3864 *reduc_fn = IFN_LAST;
3865 return true;
3867 default:
3868 return false;
3870 else
3871 switch (combined_fn (code))
3873 CASE_CFN_FMAX:
3874 *reduc_fn = IFN_REDUC_FMAX;
3875 return true;
3877 CASE_CFN_FMIN:
3878 *reduc_fn = IFN_REDUC_FMIN;
3879 return true;
3881 default:
3882 return false;
3886 /* If there is a neutral value X such that a reduction would not be affected
3887 by the introduction of additional X elements, return that X, otherwise
3888 return null.  CODE is the code of the reduction and SCALAR_TYPE is the type
3889 of the scalar elements. If the reduction has just a single initial value
3890 then INITIAL_VALUE is that value, otherwise it is null.
3891 If AS_INITIAL is TRUE the value is supposed to be used as the initial value.
3892 In that case no signed zero is returned. */
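/* For example, a PLUS_EXPR reduction has neutral value 0 (or -0.0 for
   a float reduction that honors signed zeros and is not used as the
   initial value), MULT_EXPR has 1, BIT_AND_EXPR has all ones, while
   MIN_EXPR/MAX_EXPR and FMIN/FMAX have no neutral constant and simply
   reuse INITIAL_VALUE.  */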
3894 tree
3895 neutral_op_for_reduction (tree scalar_type, code_helper code,
3896 tree initial_value, bool as_initial)
3898 if (code.is_tree_code ())
3899 switch (tree_code (code))
3901 case DOT_PROD_EXPR:
3902 case SAD_EXPR:
3903 case MINUS_EXPR:
3904 case BIT_IOR_EXPR:
3905 case BIT_XOR_EXPR:
3906 return build_zero_cst (scalar_type);
3907 case WIDEN_SUM_EXPR:
3908 case PLUS_EXPR:
3909 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3910 return build_real (scalar_type, dconstm0);
3911 else
3912 return build_zero_cst (scalar_type);
3914 case MULT_EXPR:
3915 return build_one_cst (scalar_type);
3917 case BIT_AND_EXPR:
3918 return build_all_ones_cst (scalar_type);
3920 case MAX_EXPR:
3921 case MIN_EXPR:
3922 return initial_value;
3924 default:
3925 return NULL_TREE;
3927 else
3928 switch (combined_fn (code))
3930 CASE_CFN_FMIN:
3931 CASE_CFN_FMAX:
3932 return initial_value;
3934 default:
3935 return NULL_TREE;
3939 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3940 STMT is printed with a message MSG. */
3942 static void
3943 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3945 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3948 /* Return true if we need an in-order reduction for operation CODE
3949 on type TYPE. */
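/* For example, a float or double summation compiled without
   -fassociative-math must be computed in order (fold-left), whereas an
   integer BIT_IOR_EXPR reduction never needs this because the operation
   cannot trap or overflow.  */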
3952 bool
3953 needs_fold_left_reduction_p (tree type, code_helper code)
3955 /* CHECKME: check for !flag_finite_math_only too? */
3956 if (SCALAR_FLOAT_TYPE_P (type))
3958 if (code.is_tree_code ())
3959 switch (tree_code (code))
3961 case MIN_EXPR:
3962 case MAX_EXPR:
3963 return false;
3965 default:
3966 return !flag_associative_math;
3968 else
3969 switch (combined_fn (code))
3971 CASE_CFN_FMIN:
3972 CASE_CFN_FMAX:
3973 return false;
3975 default:
3976 return !flag_associative_math;
3980 if (INTEGRAL_TYPE_P (type))
3981 return (!code.is_tree_code ()
3982 || !operation_no_trapping_overflow (type, tree_code (code)));
3984 if (SAT_FIXED_POINT_TYPE_P (type))
3985 return true;
3987 return false;
3990 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3991 has a handled computation expression. Store the main reduction
3992 operation in *CODE. */
3994 static bool
3995 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3996 tree loop_arg, code_helper *code,
3997 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3999 auto_bitmap visited;
4000 tree lookfor = PHI_RESULT (phi);
4001 ssa_op_iter curri;
4002 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4003 while (USE_FROM_PTR (curr) != loop_arg)
4004 curr = op_iter_next_use (&curri);
4005 curri.i = curri.numops;
4008 path.safe_push (std::make_pair (curri, curr));
4009 tree use = USE_FROM_PTR (curr);
4010 if (use == lookfor)
4011 break;
4012 gimple *def = SSA_NAME_DEF_STMT (use);
4013 if (gimple_nop_p (def)
4014 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4016 pop:
4019 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4020 curri = x.first;
4021 curr = x.second;
4023 curr = op_iter_next_use (&curri);
4024 /* Skip already visited or non-SSA operands (from iterating
4025 over PHI args). */
4026 while (curr != NULL_USE_OPERAND_P
4027 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4028 || ! bitmap_set_bit (visited,
4029 SSA_NAME_VERSION
4030 (USE_FROM_PTR (curr)))));
4032 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4033 if (curr == NULL_USE_OPERAND_P)
4034 break;
4036 else
4038 if (gimple_code (def) == GIMPLE_PHI)
4039 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4040 else
4041 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4042 while (curr != NULL_USE_OPERAND_P
4043 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4044 || ! bitmap_set_bit (visited,
4045 SSA_NAME_VERSION
4046 (USE_FROM_PTR (curr)))))
4047 curr = op_iter_next_use (&curri);
4048 if (curr == NULL_USE_OPERAND_P)
4049 goto pop;
4052 while (1);
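/* For illustration: for a simple accumulation
     sum_1 = PHI <sum_0(preheader), sum_2(latch)>
     sum_2 = sum_1 + _3;
   the path consists of the latch use of sum_2 in the PHI followed by the
   use of sum_1 in the addition, and the code detected below is
   PLUS_EXPR.  */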
4053 if (dump_file && (dump_flags & TDF_DETAILS))
4055 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4056 unsigned i;
4057 std::pair<ssa_op_iter, use_operand_p> *x;
4058 FOR_EACH_VEC_ELT (path, i, x)
4059 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4060 dump_printf (MSG_NOTE, "\n");
4063 /* Check whether the reduction path detected is valid. */
4064 bool fail = path.length () == 0;
4065 bool neg = false;
4066 int sign = -1;
4067 *code = ERROR_MARK;
4068 for (unsigned i = 1; i < path.length (); ++i)
4070 gimple *use_stmt = USE_STMT (path[i].second);
4071 gimple_match_op op;
4072 if (!gimple_extract_op (use_stmt, &op))
4074 fail = true;
4075 break;
4077 unsigned int opi = op.num_ops;
4078 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4080 /* The following makes sure we can compute the operand index
4081 easily; it also mostly disallows chaining via COND_EXPR condition
4082 operands. */
4083 for (opi = 0; opi < op.num_ops; ++opi)
4084 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4085 break;
4087 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4089 for (opi = 0; opi < op.num_ops; ++opi)
4090 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4091 break;
4093 if (opi == op.num_ops)
4095 fail = true;
4096 break;
4098 op.code = canonicalize_code (op.code, op.type);
4099 if (op.code == MINUS_EXPR)
4101 op.code = PLUS_EXPR;
4102 /* Track whether we negate the reduction value each iteration. */
4103 if (op.ops[1] == op.ops[opi])
4104 neg = ! neg;
4106 if (CONVERT_EXPR_CODE_P (op.code)
4107 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4109 else if (*code == ERROR_MARK)
4111 *code = op.code;
4112 sign = TYPE_SIGN (op.type);
4114 else if (op.code != *code)
4116 fail = true;
4117 break;
4119 else if ((op.code == MIN_EXPR
4120 || op.code == MAX_EXPR)
4121 && sign != TYPE_SIGN (op.type))
4123 fail = true;
4124 break;
4126 /* Check there's only a single stmt the op is used on. For the
4127 non-value-changing tail and the last stmt, allow out-of-loop uses.
4128 ??? We could relax this and handle arbitrary live stmts by
4129 forcing a scalar epilogue for example. */
4130 imm_use_iterator imm_iter;
4131 use_operand_p use_p;
4132 gimple *op_use_stmt;
4133 unsigned cnt = 0;
4134 bool cond_fn_p = op.code.is_internal_fn ()
4135 && (conditional_internal_fn_code (internal_fn (op.code))
4136 != ERROR_MARK);
4138 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4140 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4141 op1 twice (once as definition, once as else) in the same operation.
4142 Allow this. */
4143 if (cond_fn_p && op_use_stmt == use_stmt)
4145 gcall *call = as_a<gcall *> (use_stmt);
4146 unsigned else_pos
4147 = internal_fn_else_index (internal_fn (op.code));
4149 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4151 if (j == else_pos)
4152 continue;
4153 if (gimple_call_arg (call, j) == op.ops[opi])
4154 cnt++;
4157 else if (!is_gimple_debug (op_use_stmt)
4158 && (*code != ERROR_MARK
4159 || flow_bb_inside_loop_p (loop,
4160 gimple_bb (op_use_stmt))))
4161 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4162 cnt++;
4165 if (cnt != 1)
4167 fail = true;
4168 break;
4171 return ! fail && ! neg && *code != ERROR_MARK;
4174 bool
4175 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4176 tree loop_arg, enum tree_code code)
4178 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4179 code_helper code_;
4180 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4181 && code_ == code);
4186 /* Function vect_is_simple_reduction
4188 (1) Detect a cross-iteration def-use cycle that represents a simple
4189 reduction computation. We look for the following pattern:
4191 loop_header:
4192 a1 = phi < a0, a2 >
4193 a3 = ...
4194 a2 = operation (a3, a1)
4198 a3 = ...
4199 loop_header:
4200 a1 = phi < a0, a2 >
4201 a2 = operation (a3, a1)
4203 such that:
4204 1. operation is commutative and associative and it is safe to
4205 change the order of the computation
4206 2. no uses for a2 in the loop (a2 is used out of the loop)
4207 3. no uses of a1 in the loop besides the reduction operation
4208 4. no uses of a1 outside the loop.
4210 Conditions 1,4 are tested here.
4211 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4213 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4214 nested cycles.
4216 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4217 reductions:
4219 a1 = phi < a0, a2 >
4220 inner loop (def of a3)
4221 a2 = phi < a3 >
4223 (4) Detect condition expressions, i.e.:
4224 for (int i = 0; i < N; i++)
4225 if (a[i] < val)
4226 ret_val = a[i];
4230 static stmt_vec_info
4231 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4232 bool *double_reduc, bool *reduc_chain_p, bool slp)
4234 gphi *phi = as_a <gphi *> (phi_info->stmt);
4235 gimple *phi_use_stmt = NULL;
4236 imm_use_iterator imm_iter;
4237 use_operand_p use_p;
4239 *double_reduc = false;
4240 *reduc_chain_p = false;
4241 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4243 tree phi_name = PHI_RESULT (phi);
4244 /* ??? If there are no uses of the PHI result the inner loop reduction
4245 won't be detected as possibly double-reduction by vectorizable_reduction
4246 because that tries to walk the PHI arg from the preheader edge which
4247 can be constant. See PR60382. */
4248 if (has_zero_uses (phi_name))
4249 return NULL;
4250 class loop *loop = (gimple_bb (phi))->loop_father;
4251 unsigned nphi_def_loop_uses = 0;
4252 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4254 gimple *use_stmt = USE_STMT (use_p);
4255 if (is_gimple_debug (use_stmt))
4256 continue;
4258 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4260 if (dump_enabled_p ())
4261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4262 "intermediate value used outside loop.\n");
4264 return NULL;
4267 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4268 op1 twice (once as definition, once as else) in the same operation.
4269 Only count it as one. */
4270 if (use_stmt != phi_use_stmt)
4272 nphi_def_loop_uses++;
4273 phi_use_stmt = use_stmt;
4277 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4278 if (TREE_CODE (latch_def) != SSA_NAME)
4280 if (dump_enabled_p ())
4281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4282 "reduction: not ssa_name: %T\n", latch_def);
4283 return NULL;
4286 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4287 if (!def_stmt_info
4288 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4289 return NULL;
4291 bool nested_in_vect_loop
4292 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4293 unsigned nlatch_def_loop_uses = 0;
4294 auto_vec<gphi *, 3> lcphis;
4295 bool inner_loop_of_double_reduc = false;
4296 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4298 gimple *use_stmt = USE_STMT (use_p);
4299 if (is_gimple_debug (use_stmt))
4300 continue;
4301 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4302 nlatch_def_loop_uses++;
4303 else
4305 /* We can have more than one loop-closed PHI. */
4306 lcphis.safe_push (as_a <gphi *> (use_stmt));
4307 if (nested_in_vect_loop
4308 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4309 == vect_double_reduction_def))
4310 inner_loop_of_double_reduc = true;
4314 /* If we are vectorizing an inner reduction, we execute it in the
4315 original order only when we are not dealing with a double
4316 reduction. */
4317 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4319 if (dump_enabled_p ())
4320 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4321 "detected nested cycle: ");
4322 return def_stmt_info;
4325 /* When the inner loop of a double reduction ends up with more than
4326 one loop-closed PHI we have failed to classify alternate such
4327 PHIs as double reduction, leading to wrong code. See PR103237. */
4328 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4332 "unhandle double reduction\n");
4333 return NULL;
4336 /* If this isn't a nested cycle or if the nested cycle reduction value
4337 is used outside of the inner loop we cannot handle uses of the reduction
4338 value. */
4339 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4341 if (dump_enabled_p ())
4342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4343 "reduction used in loop.\n");
4344 return NULL;
4347 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4348 defined in the inner loop. */
4349 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4351 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4352 if (gimple_phi_num_args (def_stmt) != 1
4353 || TREE_CODE (op1) != SSA_NAME)
4355 if (dump_enabled_p ())
4356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4357 "unsupported phi node definition.\n");
4359 return NULL;
4362 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4363 and the latch definition op1. */
4364 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4365 if (gimple_bb (def1)
4366 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4367 && loop->inner
4368 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4369 && (is_gimple_assign (def1) || is_gimple_call (def1))
4370 && is_a <gphi *> (phi_use_stmt)
4371 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4372 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4373 loop_latch_edge (loop->inner))))
4375 if (dump_enabled_p ())
4376 report_vect_op (MSG_NOTE, def_stmt,
4377 "detected double reduction: ");
4379 *double_reduc = true;
4380 return def_stmt_info;
4383 return NULL;
4386 /* Look for the expression computing latch_def from the loop PHI result. */
4387 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4388 code_helper code;
4389 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4390 path))
4392 STMT_VINFO_REDUC_CODE (phi_info) = code;
4393 if (code == COND_EXPR && !nested_in_vect_loop)
4394 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4396 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4397 reduction chain for which the additional restriction is that
4398 all operations in the chain are the same. */
4399 auto_vec<stmt_vec_info, 8> reduc_chain;
4400 unsigned i;
4401 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4402 for (i = path.length () - 1; i >= 1; --i)
4404 gimple *stmt = USE_STMT (path[i].second);
4405 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4406 gimple_match_op op;
4407 if (!gimple_extract_op (stmt, &op))
4408 gcc_unreachable ();
4409 if (gassign *assign = dyn_cast<gassign *> (stmt))
4410 STMT_VINFO_REDUC_IDX (stmt_info)
4411 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4412 else
4414 gcall *call = as_a<gcall *> (stmt);
4415 STMT_VINFO_REDUC_IDX (stmt_info)
4416 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4418 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4419 && (i == 1 || i == path.length () - 1));
4420 if ((op.code != code && !leading_conversion)
4421 /* We can only handle the final value in epilogue
4422 generation for reduction chains. */
4423 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4424 is_slp_reduc = false;
4425 /* For reduction chains we support trailing/leading
4426 conversions. We do not store those in the actual chain. */
4427 if (leading_conversion)
4428 continue;
4429 reduc_chain.safe_push (stmt_info);
4431 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4433 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4435 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4436 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4438 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4439 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4441 /* Save the chain for further analysis in SLP detection. */
4442 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4443 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4445 *reduc_chain_p = true;
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "reduction: detected reduction chain\n");
4450 else if (dump_enabled_p ())
4451 dump_printf_loc (MSG_NOTE, vect_location,
4452 "reduction: detected reduction\n");
4454 return def_stmt_info;
4457 if (dump_enabled_p ())
4458 dump_printf_loc (MSG_NOTE, vect_location,
4459 "reduction: unknown pattern\n");
4461 return NULL;
4464 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4465 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4466 or -1 if not known. */
4468 static int
4469 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4471 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4472 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4474 if (dump_enabled_p ())
4475 dump_printf_loc (MSG_NOTE, vect_location,
4476 "cost model: epilogue peel iters set to vf/2 "
4477 "because loop iterations are unknown .\n");
4478 return assumed_vf / 2;
4480 else
4482 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4483 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4484 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
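/* For illustration: with NITERS = 100, PEEL_ITERS_PROLOGUE = 3 and an
   assumed VF of 8, the epilogue handles (100 - 3) % 8 = 1 iteration.  */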
4485 /* If we need to peel for gaps but no epilogue peeling would otherwise
4486 be required, we have to peel VF iterations. */
4487 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4488 peel_iters_epilogue = assumed_vf;
4489 return peel_iters_epilogue;
4493 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4495 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4496 int *peel_iters_epilogue,
4497 stmt_vector_for_cost *scalar_cost_vec,
4498 stmt_vector_for_cost *prologue_cost_vec,
4499 stmt_vector_for_cost *epilogue_cost_vec)
4501 int retval = 0;
4503 *peel_iters_epilogue
4504 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4506 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4508 /* If peeled iterations are known but the number of scalar loop
4509 iterations is unknown, count a taken branch per peeled loop. */
4510 if (peel_iters_prologue > 0)
4511 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4512 vect_prologue);
4513 if (*peel_iters_epilogue > 0)
4514 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4515 vect_epilogue);
4518 stmt_info_for_cost *si;
4519 int j;
4520 if (peel_iters_prologue)
4521 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4522 retval += record_stmt_cost (prologue_cost_vec,
4523 si->count * peel_iters_prologue,
4524 si->kind, si->stmt_info, si->misalign,
4525 vect_prologue);
4526 if (*peel_iters_epilogue)
4527 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4528 retval += record_stmt_cost (epilogue_cost_vec,
4529 si->count * *peel_iters_epilogue,
4530 si->kind, si->stmt_info, si->misalign,
4531 vect_epilogue);
4533 return retval;
4536 /* Function vect_estimate_min_profitable_iters
4538 Return the number of iterations required for the vector version of the
4539 loop to be profitable relative to the cost of the scalar version of the
4540 loop.
4542 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4543 of iterations for vectorization.  A value of -1 means loop vectorization
4544 is not profitable. This returned value may be used for dynamic
4545 profitability check.
4547 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4548 for static check against estimated number of iterations. */
4550 static void
4551 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4552 int *ret_min_profitable_niters,
4553 int *ret_min_profitable_estimate,
4554 unsigned *suggested_unroll_factor)
4556 int min_profitable_iters;
4557 int min_profitable_estimate;
4558 int peel_iters_prologue;
4559 int peel_iters_epilogue;
4560 unsigned vec_inside_cost = 0;
4561 int vec_outside_cost = 0;
4562 unsigned vec_prologue_cost = 0;
4563 unsigned vec_epilogue_cost = 0;
4564 int scalar_single_iter_cost = 0;
4565 int scalar_outside_cost = 0;
4566 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4567 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4568 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4570 /* Cost model disabled. */
4571 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4573 if (dump_enabled_p ())
4574 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4575 *ret_min_profitable_niters = 0;
4576 *ret_min_profitable_estimate = 0;
4577 return;
4580 /* Requires loop versioning tests to handle misalignment. */
4581 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4583 /* FIXME: Make cost depend on complexity of individual check. */
4584 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4585 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4586 if (dump_enabled_p ())
4587 dump_printf (MSG_NOTE,
4588 "cost model: Adding cost of checks for loop "
4589 "versioning to treat misalignment.\n");
4592 /* Requires loop versioning with alias checks. */
4593 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4595 /* FIXME: Make cost depend on complexity of individual check. */
4596 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4597 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4598 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4599 if (len)
4600 /* Count LEN - 1 ANDs and LEN comparisons. */
4601 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4602 scalar_stmt, vect_prologue);
4603 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4604 if (len)
4606 /* Count LEN - 1 ANDs and LEN comparisons. */
4607 unsigned int nstmts = len * 2 - 1;
4608 /* +1 for each bias that needs adding. */
4609 for (unsigned int i = 0; i < len; ++i)
4610 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4611 nstmts += 1;
4612 (void) add_stmt_cost (target_cost_data, nstmts,
4613 scalar_stmt, vect_prologue);
4615 if (dump_enabled_p ())
4616 dump_printf (MSG_NOTE,
4617 "cost model: Adding cost of checks for loop "
4618 "versioning aliasing.\n");
4621 /* Requires loop versioning with niter checks. */
4622 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4624 /* FIXME: Make cost depend on complexity of individual check. */
4625 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4626 NULL, NULL, NULL_TREE, 0, vect_prologue);
4627 if (dump_enabled_p ())
4628 dump_printf (MSG_NOTE,
4629 "cost model: Adding cost of checks for loop "
4630 "versioning niters.\n");
4633 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4634 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4635 vect_prologue);
4637 /* Count statements in scalar loop. Using this as scalar cost for a single
4638 iteration for now.
4640 TODO: Add outer loop support.
4642 TODO: Consider assigning different costs to different scalar
4643 statements. */
4645 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4647 /* Add additional cost for the peeled instructions in prologue and epilogue
4648 loop. (For fully-masked loops there will be no peeling.)
4650 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4651 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4653 TODO: Build an expression that represents peel_iters for prologue and
4654 epilogue to be used in a run-time test. */
4656 bool prologue_need_br_taken_cost = false;
4657 bool prologue_need_br_not_taken_cost = false;
4659 /* Calculate peel_iters_prologue. */
4660 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4661 peel_iters_prologue = 0;
4662 else if (npeel < 0)
4664 peel_iters_prologue = assumed_vf / 2;
4665 if (dump_enabled_p ())
4666 dump_printf (MSG_NOTE, "cost model: "
4667 "prologue peel iters set to vf/2.\n");
4669 /* If peeled iterations are unknown, count a taken branch and a not taken
4670 branch per peeled loop. Even if scalar loop iterations are known,
4671 vector iterations are not known since peeled prologue iterations are
4672 not known. Hence guards remain the same. */
4673 prologue_need_br_taken_cost = true;
4674 prologue_need_br_not_taken_cost = true;
4676 else
4678 peel_iters_prologue = npeel;
4679 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4680 /* If peeled iterations are known but the number of scalar loop
4681 iterations is unknown, count a taken branch per peeled loop. */
4682 prologue_need_br_taken_cost = true;
4685 bool epilogue_need_br_taken_cost = false;
4686 bool epilogue_need_br_not_taken_cost = false;
4688 /* Calculate peel_iters_epilogue. */
4689 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4690 /* We need to peel exactly one iteration for gaps. */
4691 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4692 else if (npeel < 0)
4694 /* If the peeling for alignment is unknown, the loop bound of the
4695 main loop becomes unknown. */
4696 peel_iters_epilogue = assumed_vf / 2;
4697 if (dump_enabled_p ())
4698 dump_printf (MSG_NOTE, "cost model: "
4699 "epilogue peel iters set to vf/2 because "
4700 "peeling for alignment is unknown.\n");
4702 /* See the same reason above in peel_iters_prologue calculation. */
4703 epilogue_need_br_taken_cost = true;
4704 epilogue_need_br_not_taken_cost = true;
4706 else
4708 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4709 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4710 /* If peeled iterations are known but the number of scalar loop
4711 iterations is unknown, count a taken branch per peeled loop. */
4712 epilogue_need_br_taken_cost = true;
4715 stmt_info_for_cost *si;
4716 int j;
4717 /* Add costs associated with peel_iters_prologue. */
4718 if (peel_iters_prologue)
4719 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4721 (void) add_stmt_cost (target_cost_data,
4722 si->count * peel_iters_prologue, si->kind,
4723 si->stmt_info, si->node, si->vectype,
4724 si->misalign, vect_prologue);
4727 /* Add costs associated with peel_iters_epilogue. */
4728 if (peel_iters_epilogue)
4729 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4731 (void) add_stmt_cost (target_cost_data,
4732 si->count * peel_iters_epilogue, si->kind,
4733 si->stmt_info, si->node, si->vectype,
4734 si->misalign, vect_epilogue);
4737 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4739 if (prologue_need_br_taken_cost)
4740 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4741 vect_prologue);
4743 if (prologue_need_br_not_taken_cost)
4744 (void) add_stmt_cost (target_cost_data, 1,
4745 cond_branch_not_taken, vect_prologue);
4747 if (epilogue_need_br_taken_cost)
4748 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4749 vect_epilogue);
4751 if (epilogue_need_br_not_taken_cost)
4752 (void) add_stmt_cost (target_cost_data, 1,
4753 cond_branch_not_taken, vect_epilogue);
4755 /* Take care of special costs for rgroup controls of partial vectors. */
4756 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4757 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4758 == vect_partial_vectors_avx512))
4760 /* Calculate how many masks we need to generate. */
4761 unsigned int num_masks = 0;
4762 bool need_saturation = false;
4763 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4764 if (rgm.type)
4766 unsigned nvectors = rgm.factor;
4767 num_masks += nvectors;
4768 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4769 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4770 need_saturation = true;
4773 /* ??? The target isn't able to identify the costs below as
4774 producing masks so it cannot penalize cases where we'd run
4775 out of mask registers for example. */
4777 /* ??? We are also failing to account for smaller vector masks
4778 we generate by splitting larger masks in vect_get_loop_mask. */
4780 /* In the worst case, we need to generate each mask in the prologue
4781 and in the loop body. We need one splat per group and one
4782 compare per mask.
4784 Sometimes the prologue mask will fold to a constant,
4785 so the actual prologue cost might be smaller. However, it's
4786 simpler and safer to use the worst-case cost; if this ends up
4787 being the tie-breaker between vectorizing or not, then it's
4788 probably better not to vectorize. */
4789 (void) add_stmt_cost (target_cost_data,
4790 num_masks
4791 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4792 vector_stmt, NULL, NULL, NULL_TREE, 0,
4793 vect_prologue);
4794 (void) add_stmt_cost (target_cost_data,
4795 num_masks
4796 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4797 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4799 /* When we need saturation we need it both in the prologue and
4800 the loop body. */
4801 if (need_saturation)
4803 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4804 NULL, NULL, NULL_TREE, 0, vect_prologue);
4805 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4806 NULL, NULL, NULL_TREE, 0, vect_body);
4809 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4810 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4811 == vect_partial_vectors_while_ult))
4813 /* Calculate how many masks we need to generate. */
4814 unsigned int num_masks = 0;
4815 rgroup_controls *rgm;
4816 unsigned int num_vectors_m1;
4817 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4818 num_vectors_m1, rgm)
4819 if (rgm->type)
4820 num_masks += num_vectors_m1 + 1;
4821 gcc_assert (num_masks > 0);
4823 /* In the worst case, we need to generate each mask in the prologue
4824 and in the loop body. One of the loop body mask instructions
4825 replaces the comparison in the scalar loop, and since we don't
4826 count the scalar comparison against the scalar body, we shouldn't
4827 count that vector instruction against the vector body either.
4829 Sometimes we can use unpacks instead of generating prologue
4830 masks and sometimes the prologue mask will fold to a constant,
4831 so the actual prologue cost might be smaller. However, it's
4832 simpler and safer to use the worst-case cost; if this ends up
4833 being the tie-breaker between vectorizing or not, then it's
4834 probably better not to vectorize. */
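/* For illustration (hypothetical rgroups): two rgroups needing one and
   two masks respectively give num_masks = 3, so we cost three mask
   statements in the prologue and num_masks - 1 = 2 in the loop body,
   the remaining body mask being the one that replaces the scalar
   loop's comparison.  */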
4835 (void) add_stmt_cost (target_cost_data, num_masks,
4836 vector_stmt, NULL, NULL, NULL_TREE, 0,
4837 vect_prologue);
4838 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4839 vector_stmt, NULL, NULL, NULL_TREE, 0,
4840 vect_body);
4842 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4844 /* Referring to the functions vect_set_loop_condition_partial_vectors
4845 and vect_set_loop_controls_directly, we need to generate each
4846 length in the prologue and in the loop body if required. Although
4847 there are some possible optimizations, we consider the worst case
4848 here. */
4850 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4851 signed char partial_load_store_bias
4852 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4853 bool need_iterate_p
4854 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4855 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4857 /* Calculate how many statements need to be added. */
4858 unsigned int prologue_stmts = 0;
4859 unsigned int body_stmts = 0;
4861 rgroup_controls *rgc;
4862 unsigned int num_vectors_m1;
4863 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4864 if (rgc->type)
4866 /* May need one SHIFT for nitems_total computation. */
4867 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4868 if (nitems != 1 && !niters_known_p)
4869 prologue_stmts += 1;
4871 /* May need one MAX and one MINUS for wrap around. */
4872 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4873 prologue_stmts += 2;
4875 /* Need one MAX and one MINUS for each batch limit except for
4876 the 1st one. */
4877 prologue_stmts += num_vectors_m1 * 2;
4879 unsigned int num_vectors = num_vectors_m1 + 1;
4881 /* Need to set up lengths in prologue, only one MIN required
4882 for each since start index is zero. */
4883 prologue_stmts += num_vectors;
4885 /* If we have a non-zero partial load bias, we need one PLUS
4886 to adjust the load length. */
4887 if (partial_load_store_bias != 0)
4888 body_stmts += 1;
4890 unsigned int length_update_cost = 0;
4891 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4892 /* For the decrement IV style, each needs only a single SELECT_VL
4893 or MIN at the beginning to calculate the number of elements
4894 to be processed in the current iteration. */
4895 length_update_cost = 1;
4896 else
4897 /* For the increment IV style, each may need two MINs and one MINUS
4898 to update the lengths in the body for the next iteration. */
4899 length_update_cost = 3;
4901 if (need_iterate_p)
4902 body_stmts += length_update_cost * num_vectors;
4905 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4906 scalar_stmt, vect_prologue);
4907 (void) add_stmt_cost (target_cost_data, body_stmts,
4908 scalar_stmt, vect_body);
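/* Illustrative (made-up) count: a single length rgroup with two vectors,
   more than one scalar item per iteration, unknown niters, a possibly
   wrapping rgroup IV and a zero load/store bias gives prologue_stmts
   = 1 (SHIFT) + 2 (MAX/MINUS for wrap) + 2 (extra batch limit)
   + 2 (initial MINs) = 7 and, if the loop needs to iterate,
   body_stmts = length_update_cost * 2.  */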
4911 /* FORNOW: The scalar outside cost is incremented in one of the
4912 following ways:
4914 1. The vectorizer checks for alignment and aliasing and generates
4915 a condition that allows dynamic vectorization. A cost model
4916 check is ANDED with the versioning condition. Hence scalar code
4917 path now has the added cost of the versioning check.
4919 if (cost > th & versioning_check)
4920 jmp to vector code
4922 Hence run-time scalar is incremented by not-taken branch cost.
4924 2. The vectorizer then checks if a prologue is required. If the
4925 cost model check was not done before during versioning, it has to
4926 be done before the prologue check.
4928 if (cost <= th)
4929 prologue = scalar_iters
4930 if (prologue == 0)
4931 jmp to vector code
4932 else
4933 execute prologue
4934 if (prologue == num_iters)
4935 go to exit
4937 Hence the run-time scalar cost is incremented by a taken branch,
4938 plus a not-taken branch, plus a taken branch cost.
4940 3. The vectorizer then checks if an epilogue is required. If the
4941 cost model check was not done before during prologue check, it
4942 has to be done with the epilogue check.
4944 if (prologue == 0)
4945 jmp to vector code
4946 else
4947 execute prologue
4948 if (prologue == num_iters)
4949 go to exit
4950 vector code:
4951 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4952 jmp to epilogue
4954 Hence the run-time scalar cost should be incremented by 2 taken
4955 branches.
4957 TODO: The back end may reorder the BBs differently and reverse
4958 conditions/branch directions. Change the estimates below to
4959 something more reasonable. */
4961 /* If the number of iterations is known and we do not do versioning, we can
4962 decide whether to vectorize at compile time. Hence the scalar version
4963 does not carry cost model guard costs. */
4964 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4965 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4967 /* Cost model check occurs at versioning. */
4968 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4969 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4970 else
4972 /* Cost model check occurs at prologue generation. */
4973 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4974 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4975 + vect_get_stmt_cost (cond_branch_not_taken);
4976 /* Cost model check occurs at epilogue generation. */
4977 else
4978 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4982 /* Complete the target-specific cost calculations. */
4983 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4984 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4985 suggested_unroll_factor);
4987 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4988 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4989 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4990 *suggested_unroll_factor,
4991 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4993 if (dump_enabled_p ())
4994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4995 "can't unroll as unrolled vectorization factor larger"
4996 " than maximum vectorization factor: "
4997 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4998 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4999 *suggested_unroll_factor = 1;
5002 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5004 if (dump_enabled_p ())
5006 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5007 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5008 vec_inside_cost);
5009 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5010 vec_prologue_cost);
5011 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5012 vec_epilogue_cost);
5013 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5014 scalar_single_iter_cost);
5015 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5016 scalar_outside_cost);
5017 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5018 vec_outside_cost);
5019 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5020 peel_iters_prologue);
5021 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5022 peel_iters_epilogue);
5025 /* Calculate number of iterations required to make the vector version
5026 profitable, relative to the loop bodies only. The following condition
5027 must hold true:
5028 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5029 where
5030 SIC = scalar iteration cost, VIC = vector iteration cost,
5031 VOC = vector outside cost, VF = vectorization factor,
5032 NPEEL = prologue iterations + epilogue iterations,
5033 SOC = scalar outside cost for run time cost model check. */
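/* Illustrative (made-up) numbers: with SIC = 1, VIC = 4, VF = 8,
   VOC = 20 and NPEEL = SOC = 0, each vector iteration covers 8 scalar
   iterations and saves SIC * VF - VIC = 4 cost units versus the scalar
   code, so the vector version needs more than VOC / 4 = 5 vector
   iterations (roughly 40 scalar iterations) to pay off the outside
   cost.  */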
5035 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5036 - vec_inside_cost);
5037 if (saving_per_viter <= 0)
5039 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5040 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5041 "vectorization did not happen for a simd loop");
5043 if (dump_enabled_p ())
5044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5045 "cost model: the vector iteration cost = %d "
5046 "divided by the scalar iteration cost = %d "
5047 "is greater or equal to the vectorization factor = %d"
5048 ".\n",
5049 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5050 *ret_min_profitable_niters = -1;
5051 *ret_min_profitable_estimate = -1;
5052 return;
5055 /* ??? The "if" arm is written to handle all cases; see below for what
5056 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5057 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5059 /* Rewriting the condition above in terms of the number of
5060 vector iterations (vniters) rather than the number of
5061 scalar iterations (niters) gives:
5063 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5065 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5067 For integer N, X and Y when X > 0:
5069 N * X > Y <==> N >= (Y /[floor] X) + 1. */
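/* E.g. with X = 4: the smallest N with N * 4 > 20 is 20 / 4 + 1 = 6,
   and the smallest N with N * 4 > 21 is 21 /[floor] 4 + 1 = 6 too.  */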
5070 int outside_overhead = (vec_outside_cost
5071 - scalar_single_iter_cost * peel_iters_prologue
5072 - scalar_single_iter_cost * peel_iters_epilogue
5073 - scalar_outside_cost);
5074 /* We're only interested in cases that require at least one
5075 vector iteration. */
5076 int min_vec_niters = 1;
5077 if (outside_overhead > 0)
5078 min_vec_niters = outside_overhead / saving_per_viter + 1;
5080 if (dump_enabled_p ())
5081 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5082 min_vec_niters);
5084 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5086 /* Now that we know the minimum number of vector iterations,
5087 find the minimum niters for which the scalar cost is larger:
5089 SIC * niters > VIC * vniters + VOC - SOC
5091 We know that the minimum niters is no more than
5092 vniters * VF + NPEEL, but it might be (and often is) less
5093 than that if a partial vector iteration is cheaper than the
5094 equivalent scalar code. */
5095 int threshold = (vec_inside_cost * min_vec_niters
5096 + vec_outside_cost
5097 - scalar_outside_cost);
5098 if (threshold <= 0)
5099 min_profitable_iters = 1;
5100 else
5101 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5103 else
5104 /* Convert the number of vector iterations into a number of
5105 scalar iterations. */
5106 min_profitable_iters = (min_vec_niters * assumed_vf
5107 + peel_iters_prologue
5108 + peel_iters_epilogue);
5110 else
5112 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5113 * assumed_vf
5114 - vec_inside_cost * peel_iters_prologue
5115 - vec_inside_cost * peel_iters_epilogue);
5116 if (min_profitable_iters <= 0)
5117 min_profitable_iters = 0;
5118 else
5120 min_profitable_iters /= saving_per_viter;
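	  /* saving_per_viter divides with truncation; bump the count if the
	     profitability condition is still not strictly satisfied.  */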
5122 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5123 <= (((int) vec_inside_cost * min_profitable_iters)
5124 + (((int) vec_outside_cost - scalar_outside_cost)
5125 * assumed_vf)))
5126 min_profitable_iters++;
5130 if (dump_enabled_p ())
5131 dump_printf (MSG_NOTE,
5132 " Calculated minimum iters for profitability: %d\n",
5133 min_profitable_iters);
5135 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5136 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5137 /* We want the vectorized loop to execute at least once. */
5138 min_profitable_iters = assumed_vf + peel_iters_prologue;
5139 else if (min_profitable_iters < peel_iters_prologue)
5140 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5141 vectorized loop executes at least once. */
5142 min_profitable_iters = peel_iters_prologue;
5144 if (dump_enabled_p ())
5145 dump_printf_loc (MSG_NOTE, vect_location,
5146 " Runtime profitability threshold = %d\n",
5147 min_profitable_iters);
5149 *ret_min_profitable_niters = min_profitable_iters;
5151 /* Calculate number of iterations required to make the vector version
5152 profitable, relative to the loop bodies only.
5154    The non-vectorized variant costs SIC * niters and must win over the vector
5155    variant for the expected loop trip count.  The following condition must hold true:
5156 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5158 if (vec_outside_cost <= 0)
5159 min_profitable_estimate = 0;
5160 /* ??? This "else if" arm is written to handle all cases; see below for
5161 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5162 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5164 /* This is a repeat of the code above, but with + SOC rather
5165 than - SOC. */
5166 int outside_overhead = (vec_outside_cost
5167 - scalar_single_iter_cost * peel_iters_prologue
5168 - scalar_single_iter_cost * peel_iters_epilogue
5169 + scalar_outside_cost);
5170 int min_vec_niters = 1;
5171 if (outside_overhead > 0)
5172 min_vec_niters = outside_overhead / saving_per_viter + 1;
5174 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5176 int threshold = (vec_inside_cost * min_vec_niters
5177 + vec_outside_cost
5178 + scalar_outside_cost);
5179 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5181 else
5182 min_profitable_estimate = (min_vec_niters * assumed_vf
5183 + peel_iters_prologue
5184 + peel_iters_epilogue);
5186 else
5188 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5189 * assumed_vf
5190 - vec_inside_cost * peel_iters_prologue
5191 - vec_inside_cost * peel_iters_epilogue)
5192 / ((scalar_single_iter_cost * assumed_vf)
5193 - vec_inside_cost);
5195 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5196 if (dump_enabled_p ())
5197 dump_printf_loc (MSG_NOTE, vect_location,
5198 " Static estimate profitability threshold = %d\n",
5199 min_profitable_estimate);
5201 *ret_min_profitable_estimate = min_profitable_estimate;
5204 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5205 vector elements (not bits) for a vector with NELT elements. */
5206 static void
5207 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5208 vec_perm_builder *sel)
5210 /* The encoding is a single stepped pattern. Any wrap-around is handled
5211 by vec_perm_indices. */
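  /* For example, OFFSET = 2 encodes the series {2, 3, 4, ...}: output
     lane I selects input lane I + 2.  */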
5212 sel->new_vector (nelt, 1, 3);
5213 for (unsigned int i = 0; i < 3; i++)
5214 sel->quick_push (i + offset);
5217 /* Checks whether the target supports whole-vector shifts for vectors of mode
5218 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5219 it supports vec_perm_const with masks for all necessary shift amounts. */
5220 static bool
5221 have_whole_vector_shift (machine_mode mode)
5223 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5224 return true;
5226 /* Variable-length vectors should be handled via the optab. */
5227 unsigned int nelt;
5228 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5229 return false;
5231 vec_perm_builder sel;
5232 vec_perm_indices indices;
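  /* Try a mask for every shift amount the reduction code may use:
     nelt / 2, nelt / 4, ..., 1 elements (e.g. 4, 2 and 1 for NELT == 8).  */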
5233 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5235 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5236 indices.new_vector (sel, 2, nelt);
5237 if (!can_vec_perm_const_p (mode, mode, indices, false))
5238 return false;
5240 return true;
5243 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5244 multiplication operands have differing signs and (b) we intend
5245 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5246 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5248 static bool
5249 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5250 stmt_vec_info stmt_info)
5252 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5253 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5254 return false;
5256 tree rhs1 = gimple_assign_rhs1 (assign);
5257 tree rhs2 = gimple_assign_rhs2 (assign);
5258 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5259 return false;
5261 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5262 gcc_assert (reduc_info->is_reduc_info);
5263 return !directly_supported_p (DOT_PROD_EXPR,
5264 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5265 optab_vector_mixed_sign);
5268 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5269 functions. Design better to avoid maintenance issues. */
5271 /* Function vect_model_reduction_cost.
5273 Models cost for a reduction operation, including the vector ops
5274 generated within the strip-mine loop in some cases, the initial
5275 definition before the loop, and the epilogue code that must be generated. */
5277 static void
5278 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5279 stmt_vec_info stmt_info, internal_fn reduc_fn,
5280 vect_reduction_type reduction_type,
5281 int ncopies, stmt_vector_for_cost *cost_vec)
5283 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5284 tree vectype;
5285 machine_mode mode;
5286 class loop *loop = NULL;
5288 if (loop_vinfo)
5289 loop = LOOP_VINFO_LOOP (loop_vinfo);
5291 /* Condition reductions generate two reductions in the loop. */
5292 if (reduction_type == COND_REDUCTION)
5293 ncopies *= 2;
5295 vectype = STMT_VINFO_VECTYPE (stmt_info);
5296 mode = TYPE_MODE (vectype);
5297 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5299 gimple_match_op op;
5300 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5301 gcc_unreachable ();
5303 bool emulated_mixed_dot_prod
5304 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5305 if (reduction_type == EXTRACT_LAST_REDUCTION)
5306 /* No extra instructions are needed in the prologue. The loop body
5307 operations are costed in vectorizable_condition. */
5308 inside_cost = 0;
5309 else if (reduction_type == FOLD_LEFT_REDUCTION)
5311 /* No extra instructions needed in the prologue. */
5312 prologue_cost = 0;
5314 if (reduc_fn != IFN_LAST)
5315 /* Count one reduction-like operation per vector. */
5316 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5317 stmt_info, 0, vect_body);
5318 else
5320 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5321 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5322 inside_cost = record_stmt_cost (cost_vec, nelements,
5323 vec_to_scalar, stmt_info, 0,
5324 vect_body);
5325 inside_cost += record_stmt_cost (cost_vec, nelements,
5326 scalar_stmt, stmt_info, 0,
5327 vect_body);
5330 else
5332 /* Add in the cost of the initial definitions. */
5333 int prologue_stmts;
5334 if (reduction_type == COND_REDUCTION)
5335 /* For cond reductions we have four vectors: initial index, step,
5336 initial result of the data reduction, initial value of the index
5337 reduction. */
5338 prologue_stmts = 4;
5339 else if (emulated_mixed_dot_prod)
5340 /* We need the initial reduction value and two invariants:
5341 one that contains the minimum signed value and one that
5342 contains half of its negative. */
5343 prologue_stmts = 3;
5344 else
5345 prologue_stmts = 1;
5346 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5347 scalar_to_vec, stmt_info, 0,
5348 vect_prologue);
5351 /* Determine cost of epilogue code.
5353 We have a reduction operator that will reduce the vector in one statement.
5354 Also requires scalar extract. */
5356 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5358 if (reduc_fn != IFN_LAST)
5360 if (reduction_type == COND_REDUCTION)
5362          /* An EQ stmt and a COND_EXPR stmt.  */
5363 epilogue_cost += record_stmt_cost (cost_vec, 2,
5364 vector_stmt, stmt_info, 0,
5365 vect_epilogue);
5366 /* Reduction of the max index and a reduction of the found
5367 values. */
5368 epilogue_cost += record_stmt_cost (cost_vec, 2,
5369 vec_to_scalar, stmt_info, 0,
5370 vect_epilogue);
5371 /* A broadcast of the max value. */
5372 epilogue_cost += record_stmt_cost (cost_vec, 1,
5373 scalar_to_vec, stmt_info, 0,
5374 vect_epilogue);
5376 else
5378 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5379 stmt_info, 0, vect_epilogue);
5380 epilogue_cost += record_stmt_cost (cost_vec, 1,
5381 vec_to_scalar, stmt_info, 0,
5382 vect_epilogue);
5385 else if (reduction_type == COND_REDUCTION)
5387 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5388 /* Extraction of scalar elements. */
5389 epilogue_cost += record_stmt_cost (cost_vec,
5390 2 * estimated_nunits,
5391 vec_to_scalar, stmt_info, 0,
5392 vect_epilogue);
5393 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5394 epilogue_cost += record_stmt_cost (cost_vec,
5395 2 * estimated_nunits - 3,
5396 scalar_stmt, stmt_info, 0,
5397 vect_epilogue);
5399 else if (reduction_type == EXTRACT_LAST_REDUCTION
5400 || reduction_type == FOLD_LEFT_REDUCTION)
5401    /* No extra instructions are needed in the epilogue.  */
5403 else
5405 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5406 tree bitsize = TYPE_SIZE (op.type);
5407 int element_bitsize = tree_to_uhwi (bitsize);
5408 int nelements = vec_size_in_bits / element_bitsize;
5410 if (op.code == COND_EXPR)
5411 op.code = MAX_EXPR;
5413 /* We have a whole vector shift available. */
5414 if (VECTOR_MODE_P (mode)
5415 && directly_supported_p (op.code, vectype)
5416 && have_whole_vector_shift (mode))
5418 /* Final reduction via vector shifts and the reduction operator.
5419 Also requires scalar extract. */
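	  /* E.g. for 8 elements this counts log2 (8) = 3 shifts plus 3
	     reduction ops (hence the factor of two), followed by a single
	     extract.  */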
5420 epilogue_cost += record_stmt_cost (cost_vec,
5421 exact_log2 (nelements) * 2,
5422 vector_stmt, stmt_info, 0,
5423 vect_epilogue);
5424 epilogue_cost += record_stmt_cost (cost_vec, 1,
5425 vec_to_scalar, stmt_info, 0,
5426 vect_epilogue);
5428 else
5429 /* Use extracts and reduction op for final reduction. For N
5430 elements, we have N extracts and N-1 reduction ops. */
5431 epilogue_cost += record_stmt_cost (cost_vec,
5432 nelements + nelements - 1,
5433 vector_stmt, stmt_info, 0,
5434 vect_epilogue);
5438 if (dump_enabled_p ())
5439 dump_printf (MSG_NOTE,
5440 "vect_model_reduction_cost: inside_cost = %d, "
5441 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5442 prologue_cost, epilogue_cost);
5445 /* SEQ is a sequence of instructions that initialize the reduction
5446 described by REDUC_INFO. Emit them in the appropriate place. */
5448 static void
5449 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5450 stmt_vec_info reduc_info, gimple *seq)
5452 if (reduc_info->reused_accumulator)
5454 /* When reusing an accumulator from the main loop, we only need
5455 initialization instructions if the main loop can be skipped.
5456 In that case, emit the initialization instructions at the end
5457 of the guard block that does the skip. */
5458 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5459 gcc_assert (skip_edge);
5460 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5461 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5463 else
5465 /* The normal case: emit the initialization instructions on the
5466 preheader edge. */
5467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5468 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5472 /* Function get_initial_def_for_reduction
5474 Input:
5475 REDUC_INFO - the info_for_reduction
5476 INIT_VAL - the initial value of the reduction variable
5477 NEUTRAL_OP - a value that has no effect on the reduction, as per
5478 neutral_op_for_reduction
5480 Output:
5481 Return a vector variable, initialized according to the operation that
5482 STMT_VINFO performs. This vector will be used as the initial value
5483 of the vector of partial results.
5485 The value we need is a vector in which element 0 has value INIT_VAL
5486 and every other element has value NEUTRAL_OP. */
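   /* For example, for an addition reduction with INIT_VAL 5 and NEUTRAL_OP 0,
      a four-element result vector would be {5, 0, 0, 0}.  */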
5488 static tree
5489 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5490 stmt_vec_info reduc_info,
5491 tree init_val, tree neutral_op)
5493 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5494 tree scalar_type = TREE_TYPE (init_val);
5495 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5496 tree init_def;
5497 gimple_seq stmts = NULL;
5499 gcc_assert (vectype);
5501 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5502 || SCALAR_FLOAT_TYPE_P (scalar_type));
5504 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5505 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5507 if (operand_equal_p (init_val, neutral_op))
5509 /* If both elements are equal then the vector described above is
5510 just a splat. */
5511 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5512 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5514 else
5516 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5517 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5518 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5520 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5521 element 0. */
5522 init_def = gimple_build_vector_from_val (&stmts, vectype,
5523 neutral_op);
5524 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5525 vectype, init_def, init_val);
5527 else
5529 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5530 tree_vector_builder elts (vectype, 1, 2);
5531 elts.quick_push (init_val);
5532 elts.quick_push (neutral_op);
5533 init_def = gimple_build_vector (&stmts, &elts);
5537 if (stmts)
5538 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5539 return init_def;
5542 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5543 which performs a reduction involving GROUP_SIZE scalar statements.
5544 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5545 is nonnull, introducing extra elements of that value will not change the
5546 result. */
5548 static void
5549 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5550 stmt_vec_info reduc_info,
5551 vec<tree> *vec_oprnds,
5552 unsigned int number_of_vectors,
5553 unsigned int group_size, tree neutral_op)
5555 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5556 unsigned HOST_WIDE_INT nunits;
5557 unsigned j, number_of_places_left_in_vector;
5558 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5559 unsigned int i;
5561 gcc_assert (group_size == initial_values.length () || neutral_op);
5563 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5564 created vectors. It is greater than 1 if unrolling is performed.
5566 For example, we have two scalar operands, s1 and s2 (e.g., group of
5567 strided accesses of size two), while NUNITS is four (i.e., four scalars
5568 of this type can be packed in a vector). The output vector will contain
5569 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5570 will be 2).
5572 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5573 vectors containing the operands.
5575 For example, NUNITS is four as before, and the group size is 8
5576 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5577 {s5, s6, s7, s8}. */
5579 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5580 nunits = group_size;
5582 number_of_places_left_in_vector = nunits;
5583 bool constant_p = true;
5584 tree_vector_builder elts (vector_type, nunits, 1);
5585 elts.quick_grow (nunits);
5586 gimple_seq ctor_seq = NULL;
5587 for (j = 0; j < nunits * number_of_vectors; ++j)
5589 tree op;
5590 i = j % group_size;
5592      /* Get the def before the loop.  In a reduction chain we have only
5593         one initial value; otherwise we have as many as there are PHIs
         in the group.  */
5594 if (i >= initial_values.length () || (j > i && neutral_op))
5595 op = neutral_op;
5596 else
5597 op = initial_values[i];
5599 /* Create 'vect_ = {op0,op1,...,opn}'. */
5600 number_of_places_left_in_vector--;
5601 elts[nunits - number_of_places_left_in_vector - 1] = op;
5602 if (!CONSTANT_CLASS_P (op))
5603 constant_p = false;
5605 if (number_of_places_left_in_vector == 0)
5607 tree init;
5608 if (constant_p && !neutral_op
5609 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5610 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5611 /* Build the vector directly from ELTS. */
5612 init = gimple_build_vector (&ctor_seq, &elts);
5613 else if (neutral_op)
5615 /* Build a vector of the neutral value and shift the
5616 other elements into place. */
5617 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5618 neutral_op);
5619 int k = nunits;
5620 while (k > 0 && elts[k - 1] == neutral_op)
5621 k -= 1;
5622 while (k > 0)
5624 k -= 1;
5625 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5626 vector_type, init, elts[k]);
5629 else
5631 /* First time round, duplicate ELTS to fill the
5632 required number of vectors. */
5633 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5634 elts, number_of_vectors, *vec_oprnds);
5635 break;
5637 vec_oprnds->quick_push (init);
5639 number_of_places_left_in_vector = nunits;
5640 elts.new_vector (vector_type, nunits, 1);
5641 elts.quick_grow (nunits);
5642 constant_p = true;
5645 if (ctor_seq != NULL)
5646 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5649 /* For a statement STMT_INFO taking part in a reduction operation, return
5650    the stmt_vec_info that the meta information is stored on.  */
5652 stmt_vec_info
5653 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5655 stmt_info = vect_orig_stmt (stmt_info);
5656 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5657 if (!is_a <gphi *> (stmt_info->stmt)
5658 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5659 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5660 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5661 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5663 if (gimple_phi_num_args (phi) == 1)
5664 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5666 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5668 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5669 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5670 stmt_info = info;
5672 return stmt_info;
5675 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5676 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5677 return false. */
5679 static bool
5680 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5681 stmt_vec_info reduc_info)
5683 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5684 if (!main_loop_vinfo)
5685 return false;
5687 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5688 return false;
5690 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5691 auto_vec<tree, 16> main_loop_results (num_phis);
5692 auto_vec<tree, 16> initial_values (num_phis);
5693 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5695 /* The epilogue loop can be entered either from the main loop or
5696 from an earlier guard block. */
5697 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5698 for (tree incoming_value : reduc_info->reduc_initial_values)
5700 /* Look for:
5702 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5703 INITIAL_VALUE(guard block)>. */
5704 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5706 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5707 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5709 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5710 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5712 main_loop_results.quick_push (from_main_loop);
5713 initial_values.quick_push (from_skip);
5716 else
5717 /* The main loop dominates the epilogue loop. */
5718 main_loop_results.splice (reduc_info->reduc_initial_values);
5720 /* See if the main loop has the kind of accumulator we need. */
5721 vect_reusable_accumulator *accumulator
5722 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5723 if (!accumulator
5724 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5725 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5726 accumulator->reduc_info->reduc_scalar_results.begin ()))
5727 return false;
5729 /* Handle the case where we can reduce wider vectors to narrower ones. */
5730 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5731 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5732 unsigned HOST_WIDE_INT m;
5733 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5734 TYPE_VECTOR_SUBPARTS (vectype), &m))
5735 return false;
5736 /* Check the intermediate vector types and operations are available. */
5737 tree prev_vectype = old_vectype;
5738 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
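  /* For example (hypothetical modes): narrowing a V8SI main-loop accumulator
     to a V4SI epilogue type requires a V8SI -> V4SI extract and a V4SI form
     of the reduction operation.  */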
5739 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5741 intermediate_nunits = exact_div (intermediate_nunits, 2);
5742 tree intermediate_vectype = get_related_vectype_for_scalar_type
5743 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5744 if (!intermediate_vectype
5745 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5746 intermediate_vectype)
5747 || !can_vec_extract (TYPE_MODE (prev_vectype),
5748 TYPE_MODE (intermediate_vectype)))
5749 return false;
5750 prev_vectype = intermediate_vectype;
5753 /* Non-SLP reductions might apply an adjustment after the reduction
5754 operation, in order to simplify the initialization of the accumulator.
5755 If the epilogue loop carries on from where the main loop left off,
5756 it should apply the same adjustment to the final reduction result.
5758 If the epilogue loop can also be entered directly (rather than via
5759 the main loop), we need to be able to handle that case in the same way,
5760 with the same adjustment. (In principle we could add a PHI node
5761 to select the correct adjustment, but in practice that shouldn't be
5762 necessary.) */
5763 tree main_adjustment
5764 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5765 if (loop_vinfo->main_loop_edge && main_adjustment)
5767 gcc_assert (num_phis == 1);
5768 tree initial_value = initial_values[0];
5769 /* Check that we can use INITIAL_VALUE as the adjustment and
5770 initialize the accumulator with a neutral value instead. */
5771 if (!operand_equal_p (initial_value, main_adjustment))
5772 return false;
5773 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5774 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5775 code, initial_value);
5777 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5778 reduc_info->reduc_initial_values.truncate (0);
5779 reduc_info->reduc_initial_values.splice (initial_values);
5780 reduc_info->reused_accumulator = accumulator;
5781 return true;
5784 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5785 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5787 static tree
5788 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5789 gimple_seq *seq)
5791 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5792 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5793 tree stype = TREE_TYPE (vectype);
5794 tree new_temp = vec_def;
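  /* Repeatedly split the accumulator in half and combine the two halves
     with CODE until the requested VECTYPE width is reached.  */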
5795 while (nunits > nunits1)
5797 nunits /= 2;
5798 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5799 stype, nunits);
5800 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5802 /* The target has to make sure we support lowpart/highpart
5803 extraction, either via direct vector extract or through
5804          integer mode punning.  */
5805 tree dst1, dst2;
5806 gimple *epilog_stmt;
5807 if (convert_optab_handler (vec_extract_optab,
5808 TYPE_MODE (TREE_TYPE (new_temp)),
5809 TYPE_MODE (vectype1))
5810 != CODE_FOR_nothing)
5812 /* Extract sub-vectors directly once vec_extract becomes
5813 a conversion optab. */
5814 dst1 = make_ssa_name (vectype1);
5815 epilog_stmt
5816 = gimple_build_assign (dst1, BIT_FIELD_REF,
5817 build3 (BIT_FIELD_REF, vectype1,
5818 new_temp, TYPE_SIZE (vectype1),
5819 bitsize_int (0)));
5820 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5821 dst2 = make_ssa_name (vectype1);
5822 epilog_stmt
5823 = gimple_build_assign (dst2, BIT_FIELD_REF,
5824 build3 (BIT_FIELD_REF, vectype1,
5825 new_temp, TYPE_SIZE (vectype1),
5826 bitsize_int (bitsize)));
5827 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5829 else
5831          /* Extract via punning to an appropriately sized integer mode
5832 vector. */
5833 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5834 tree etype = build_vector_type (eltype, 2);
5835 gcc_assert (convert_optab_handler (vec_extract_optab,
5836 TYPE_MODE (etype),
5837 TYPE_MODE (eltype))
5838 != CODE_FOR_nothing);
5839 tree tem = make_ssa_name (etype);
5840 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5841 build1 (VIEW_CONVERT_EXPR,
5842 etype, new_temp));
5843 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5844 new_temp = tem;
5845 tem = make_ssa_name (eltype);
5846 epilog_stmt
5847 = gimple_build_assign (tem, BIT_FIELD_REF,
5848 build3 (BIT_FIELD_REF, eltype,
5849 new_temp, TYPE_SIZE (eltype),
5850 bitsize_int (0)));
5851 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5852 dst1 = make_ssa_name (vectype1);
5853 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5854 build1 (VIEW_CONVERT_EXPR,
5855 vectype1, tem));
5856 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5857 tem = make_ssa_name (eltype);
5858 epilog_stmt
5859 = gimple_build_assign (tem, BIT_FIELD_REF,
5860 build3 (BIT_FIELD_REF, eltype,
5861 new_temp, TYPE_SIZE (eltype),
5862 bitsize_int (bitsize)));
5863 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5864 dst2 = make_ssa_name (vectype1);
5865 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5866 build1 (VIEW_CONVERT_EXPR,
5867 vectype1, tem));
5868 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5871 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5874 return new_temp;
5877 /* Retrieves the defining statement to be used for a reduction.
5878 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5879 the reduction definitions. */
5881 tree
5882 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5883 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5884 vec <gimple *> &vec_stmts)
5886 tree def;
5888 if (slp_node)
5890 if (!main_exit_p)
5891 slp_node = slp_node_instance->reduc_phis;
5892 def = vect_get_slp_vect_def (slp_node, i);
5894 else
5896 if (!main_exit_p)
5897 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5898 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5899 def = gimple_get_lhs (vec_stmts[0]);
5902 return def;
5905 /* Function vect_create_epilog_for_reduction
5907 Create code at the loop-epilog to finalize the result of a reduction
5908 computation.
5910 STMT_INFO is the scalar reduction stmt that is being vectorized.
5911 SLP_NODE is an SLP node containing a group of reduction statements. The
5912 first one in this group is STMT_INFO.
5913 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5914 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5915 (counting from 0)
5916 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5917 exit this edge is always the main loop exit.
5919 This function:
5920 1. Completes the reduction def-use cycles.
5921 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5922 by calling the function specified by REDUC_FN if available, or by
5923 other means (whole-vector shifts or a scalar loop).
5924 The function also creates a new phi node at the loop exit to preserve
5925 loop-closed form, as illustrated below.
5927 The flow at the entry to this function:
5929 loop:
5930 vec_def = phi <vec_init, null> # REDUCTION_PHI
5931 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5932 s_loop = scalar_stmt # (scalar) STMT_INFO
5933 loop_exit:
5934 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5935 use <s_out0>
5936 use <s_out0>
5938 The above is transformed by this function into:
5940 loop:
5941 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5942 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5943 s_loop = scalar_stmt # (scalar) STMT_INFO
5944 loop_exit:
5945 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5946 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5947 v_out2 = reduce <v_out1>
5948 s_out3 = extract_field <v_out2, 0>
5949 s_out4 = adjust_result <s_out3>
5950 use <s_out4>
5951 use <s_out4>
5954 static void
5955 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5956 stmt_vec_info stmt_info,
5957 slp_tree slp_node,
5958 slp_instance slp_node_instance,
5959 edge loop_exit)
5961 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5962 gcc_assert (reduc_info->is_reduc_info);
5963 /* For double reductions we need to get at the inner loop reduction
5964 stmt which has the meta info attached. Our stmt_info is that of the
5965 loop-closed PHI of the inner loop which we remember as
5966 def for the reduction PHI generation. */
5967 bool double_reduc = false;
5968 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5969 stmt_vec_info rdef_info = stmt_info;
5970 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5972 gcc_assert (!slp_node);
5973 double_reduc = true;
5974 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5975 (stmt_info->stmt, 0));
5976 stmt_info = vect_stmt_to_vectorize (stmt_info);
5978 gphi *reduc_def_stmt
5979 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5980 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5981 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5982 tree vectype;
5983 machine_mode mode;
5984 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5985 basic_block exit_bb;
5986 tree scalar_dest;
5987 tree scalar_type;
5988 gimple *new_phi = NULL, *phi = NULL;
5989 gimple_stmt_iterator exit_gsi;
5990 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5991 gimple *epilog_stmt = NULL;
5992 gimple *exit_phi;
5993 tree bitsize;
5994 tree def;
5995 tree orig_name, scalar_result;
5996 imm_use_iterator imm_iter, phi_imm_iter;
5997 use_operand_p use_p, phi_use_p;
5998 gimple *use_stmt;
5999 auto_vec<tree> reduc_inputs;
6000 int j, i;
6001 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6002 unsigned int group_size = 1, k;
6003 auto_vec<gimple *> phis;
6004 /* SLP reduction without reduction chain, e.g.,
6005 # a1 = phi <a2, a0>
6006 # b1 = phi <b2, b0>
6007 a2 = operation (a1)
6008 b2 = operation (b1) */
6009 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6010 bool direct_slp_reduc;
6011 tree induction_index = NULL_TREE;
6013 if (slp_node)
6014 group_size = SLP_TREE_LANES (slp_node);
6016 if (nested_in_vect_loop_p (loop, stmt_info))
6018 outer_loop = loop;
6019 loop = loop->inner;
6020 gcc_assert (!slp_node && double_reduc);
6023 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6024 gcc_assert (vectype);
6025 mode = TYPE_MODE (vectype);
6027 tree induc_val = NULL_TREE;
6028 tree adjustment_def = NULL;
6029 if (slp_node)
6031 else
6033 /* Optimize: for induction condition reduction, if we can't use zero
6034 for induc_val, use initial_def. */
6035 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6036 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6037 else if (double_reduc)
6039 else
6040 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6043 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6044 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6045 if (slp_reduc)
6046 /* All statements produce live-out values. */
6047 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6048 else if (slp_node)
6050 /* The last statement in the reduction chain produces the live-out
6051 value. Note SLP optimization can shuffle scalar stmts to
6052 optimize permutations so we have to search for the last stmt. */
6053 for (k = 0; k < group_size; ++k)
6054 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6056 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6057 break;
6061 unsigned vec_num;
6062 int ncopies;
6063 if (slp_node)
6065 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6066 ncopies = 1;
6068 else
6070 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6071 vec_num = 1;
6072 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6075 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6076 which is updated with the current index of the loop for every match of
6077 the original loop's cond_expr (VEC_STMT). This results in a vector
6078 containing the last time the condition passed for that vector lane.
6079 The first match will be a 1 to allow 0 to be used for non-matching
6080 indexes. If there are no matches at all then the vector will be all
6081 zeroes.
6083 PR92772: This algorithm is broken for architectures that support
6084 masked vectors, but do not provide fold_extract_last. */
6085 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6087 auto_vec<std::pair<tree, bool>, 2> ccompares;
6088 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6089 cond_info = vect_stmt_to_vectorize (cond_info);
6090 while (cond_info != reduc_info)
6092 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6094 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6095 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6096 ccompares.safe_push
6097 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6098 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6100 cond_info
6101 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6102 1 + STMT_VINFO_REDUC_IDX
6103 (cond_info)));
6104 cond_info = vect_stmt_to_vectorize (cond_info);
6106 gcc_assert (ccompares.length () != 0);
6108 tree indx_before_incr, indx_after_incr;
6109 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6110 int scalar_precision
6111 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6112 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6113 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6114 (TYPE_MODE (vectype), cr_index_scalar_type,
6115 TYPE_VECTOR_SUBPARTS (vectype));
6117 /* First we create a simple vector induction variable which starts
6118 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6119 vector size (STEP). */
6121 /* Create a {1,2,3,...} vector. */
6122 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6124 /* Create a vector of the step value. */
6125 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6126 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6128 /* Create an induction variable. */
6129 gimple_stmt_iterator incr_gsi;
6130 bool insert_after;
6131 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6132 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6133 insert_after, &indx_before_incr, &indx_after_incr);
6135 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6136 filled with zeros (VEC_ZERO). */
6138 /* Create a vector of 0s. */
6139 tree zero = build_zero_cst (cr_index_scalar_type);
6140 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6142 /* Create a vector phi node. */
6143 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6144 new_phi = create_phi_node (new_phi_tree, loop->header);
6145 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6146 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6148      /* Now take the condition from the loop's original cond_exprs
6149 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6150 every match uses values from the induction variable
6151 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6152 (NEW_PHI_TREE).
6153 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6154 the new cond_expr (INDEX_COND_EXPR). */
6155 gimple_seq stmts = NULL;
6156 for (int i = ccompares.length () - 1; i != -1; --i)
6158 tree ccompare = ccompares[i].first;
6159 if (ccompares[i].second)
6160 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6161 cr_index_vector_type,
6162 ccompare,
6163 indx_before_incr, new_phi_tree);
6164 else
6165 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6166 cr_index_vector_type,
6167 ccompare,
6168 new_phi_tree, indx_before_incr);
6170 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6172 /* Update the phi with the vec cond. */
6173 induction_index = new_phi_tree;
6174 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6175 loop_latch_edge (loop), UNKNOWN_LOCATION);
6178 /* 2. Create epilog code.
6179 The reduction epilog code operates across the elements of the vector
6180 of partial results computed by the vectorized loop.
6181 The reduction epilog code consists of:
6183 step 1: compute the scalar result in a vector (v_out2)
6184 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6185 step 3: adjust the scalar result (s_out3) if needed.
6187 Step 1 can be accomplished using one the following three schemes:
6188 (scheme 1) using reduc_fn, if available.
6189 (scheme 2) using whole-vector shifts, if available.
6190 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6191 combined.
6193 The overall epilog code looks like this:
6195 s_out0 = phi <s_loop> # original EXIT_PHI
6196 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6197 v_out2 = reduce <v_out1> # step 1
6198 s_out3 = extract_field <v_out2, 0> # step 2
6199 s_out4 = adjust_result <s_out3> # step 3
6201 (step 3 is optional, and steps 1 and 2 may be combined).
6202 Lastly, the uses of s_out0 are replaced by s_out4. */
6205 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6206 v_out1 = phi <VECT_DEF>
6207 Store them in NEW_PHIS. */
6208 if (double_reduc)
6209 loop = outer_loop;
6210 /* We need to reduce values in all exits. */
6211 exit_bb = loop_exit->dest;
6212 exit_gsi = gsi_after_labels (exit_bb);
6213 reduc_inputs.create (slp_node ? vec_num : ncopies);
6214 vec <gimple *> vec_stmts = vNULL;
6215 for (unsigned i = 0; i < vec_num; i++)
6217 gimple_seq stmts = NULL;
6218 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6219 main_exit_p, i, vec_stmts);
6220 for (j = 0; j < ncopies; j++)
6222 tree new_def = copy_ssa_name (def);
6223 phi = create_phi_node (new_def, exit_bb);
6224 if (j)
6225 def = gimple_get_lhs (vec_stmts[j]);
6226 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6227 new_def = gimple_convert (&stmts, vectype, new_def);
6228 reduc_inputs.quick_push (new_def);
6230 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6233 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6234 (i.e. when reduc_fn is not available) and in the final adjustment
6235 code (if needed). Also get the original scalar reduction variable as
6236 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6237 represents a reduction pattern), the tree-code and scalar-def are
6238 taken from the original stmt that the pattern-stmt (STMT) replaces.
6239 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6240 are taken from STMT. */
6242 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6243 if (orig_stmt_info != stmt_info)
6245 /* Reduction pattern */
6246 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6247 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6250 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6251 scalar_type = TREE_TYPE (scalar_dest);
6252 scalar_results.truncate (0);
6253 scalar_results.reserve_exact (group_size);
6254 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6255 bitsize = TYPE_SIZE (scalar_type);
6257 /* True if we should implement SLP_REDUC using native reduction operations
6258 instead of scalar operations. */
6259 direct_slp_reduc = (reduc_fn != IFN_LAST
6260 && slp_reduc
6261 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6263 /* In case of reduction chain, e.g.,
6264 # a1 = phi <a3, a0>
6265 a2 = operation (a1)
6266 a3 = operation (a2),
6268 we may end up with more than one vector result. Here we reduce them
6269 to one vector.
6271 The same is true for a SLP reduction, e.g.,
6272 # a1 = phi <a2, a0>
6273 # b1 = phi <b2, b0>
6274 a2 = operation (a1)
6275        b2 = operation (b1),
6277 where we can end up with more than one vector as well. We can
6278 easily accumulate vectors when the number of vector elements is
6279 a multiple of the SLP group size.
6281 The same is true if we couldn't use a single defuse cycle. */
6282 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6283 || direct_slp_reduc
6284 || (slp_reduc
6285 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6286 || ncopies > 1)
6288 gimple_seq stmts = NULL;
6289 tree single_input = reduc_inputs[0];
6290 for (k = 1; k < reduc_inputs.length (); k++)
6291 single_input = gimple_build (&stmts, code, vectype,
6292 single_input, reduc_inputs[k]);
6293 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6295 reduc_inputs.truncate (0);
6296 reduc_inputs.safe_push (single_input);
6299 tree orig_reduc_input = reduc_inputs[0];
6301 /* If this loop is an epilogue loop that can be skipped after the
6302 main loop, we can only share a reduction operation between the
6303 main loop and the epilogue if we put it at the target of the
6304 skip edge.
6306 We can still reuse accumulators if this check fails. Doing so has
6307 the minor(?) benefit of making the epilogue loop's scalar result
6308 independent of the main loop's scalar result. */
6309 bool unify_with_main_loop_p = false;
6310 if (reduc_info->reused_accumulator
6311 && loop_vinfo->skip_this_loop_edge
6312 && single_succ_p (exit_bb)
6313 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6315 unify_with_main_loop_p = true;
6317 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6318 reduc_inputs[0] = make_ssa_name (vectype);
6319 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6320 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6321 UNKNOWN_LOCATION);
6322 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6323 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6324 exit_gsi = gsi_after_labels (reduc_block);
6327 /* Shouldn't be used beyond this point. */
6328 exit_bb = nullptr;
6330 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6331 && reduc_fn != IFN_LAST)
6333 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6334 various data values where the condition matched and another vector
6335 (INDUCTION_INDEX) containing all the indexes of those matches. We
6336 need to extract the last matching index (which will be the index with
6337 highest value) and use this to index into the data vector.
6338 For the case where there were no matches, the data vector will contain
6339 all default values and the index vector will be all zeros. */
6341 /* Get various versions of the type of the vector of indexes. */
6342 tree index_vec_type = TREE_TYPE (induction_index);
6343 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6344 tree index_scalar_type = TREE_TYPE (index_vec_type);
6345 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6347 /* Get an unsigned integer version of the type of the data vector. */
6348 int scalar_precision
6349 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6350 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6351 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6352 vectype);
6354 /* First we need to create a vector (ZERO_VEC) of zeros and another
6355 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6356 can create using a MAX reduction and then expanding.
6357 In the case where the loop never made any matches, the max index will
6358 be zero. */
6360 /* Vector of {0, 0, 0,...}. */
6361 tree zero_vec = build_zero_cst (vectype);
6363 /* Find maximum value from the vector of found indexes. */
6364 tree max_index = make_ssa_name (index_scalar_type);
6365 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6366 1, induction_index);
6367 gimple_call_set_lhs (max_index_stmt, max_index);
6368 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6370 /* Vector of {max_index, max_index, max_index,...}. */
6371 tree max_index_vec = make_ssa_name (index_vec_type);
6372 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6373 max_index);
6374 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6375 max_index_vec_rhs);
6376 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6378 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6379 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6380 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6381 otherwise. Only one value should match, resulting in a vector
6382 (VEC_COND) with one data value and the rest zeros.
6383 In the case where the loop never made any matches, every index will
6384 match, resulting in a vector with all data values (which will all be
6385 the default value). */
6387 /* Compare the max index vector to the vector of found indexes to find
6388 the position of the max value. */
6389 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6390 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6391 induction_index,
6392 max_index_vec);
6393 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6395 /* Use the compare to choose either values from the data vector or
6396 zero. */
6397 tree vec_cond = make_ssa_name (vectype);
6398 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6399 vec_compare,
6400 reduc_inputs[0],
6401 zero_vec);
6402 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6404 /* Finally we need to extract the data value from the vector (VEC_COND)
6405         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6406 reduction, but because this doesn't exist, we can use a MAX reduction
6407 instead. The data value might be signed or a float so we need to cast
6408 it first.
6409 In the case where the loop never made any matches, the data values are
6410 all identical, and so will reduce down correctly. */
6412 /* Make the matched data values unsigned. */
6413 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6414 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6415 vec_cond);
6416 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6417 VIEW_CONVERT_EXPR,
6418 vec_cond_cast_rhs);
6419 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6421 /* Reduce down to a scalar value. */
6422 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6423 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6424 1, vec_cond_cast);
6425 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6426 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6428 /* Convert the reduced value back to the result type and set as the
6429 result. */
6430 gimple_seq stmts = NULL;
6431 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6432 data_reduc);
6433 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6434 scalar_results.safe_push (new_temp);
6436 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6437 && reduc_fn == IFN_LAST)
6439 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6440 idx = 0;
6441 idx_val = induction_index[0];
6442 val = data_reduc[0];
6443 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6444 if (induction_index[i] > idx_val)
6445 val = data_reduc[i], idx_val = induction_index[i];
6446 return val; */
6448 tree data_eltype = TREE_TYPE (vectype);
6449 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6450 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6451 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6452 /* Enforced by vectorizable_reduction, which ensures we have target
6453 support before allowing a conditional reduction on variable-length
6454 vectors. */
6455 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6456 tree idx_val = NULL_TREE, val = NULL_TREE;
6457 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6459 tree old_idx_val = idx_val;
6460 tree old_val = val;
6461 idx_val = make_ssa_name (idx_eltype);
6462 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6463 build3 (BIT_FIELD_REF, idx_eltype,
6464 induction_index,
6465 bitsize_int (el_size),
6466 bitsize_int (off)));
6467 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6468 val = make_ssa_name (data_eltype);
6469 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6470 build3 (BIT_FIELD_REF,
6471 data_eltype,
6472 reduc_inputs[0],
6473 bitsize_int (el_size),
6474 bitsize_int (off)));
6475 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6476 if (off != 0)
6478 tree new_idx_val = idx_val;
6479 if (off != v_size - el_size)
6481 new_idx_val = make_ssa_name (idx_eltype);
6482 epilog_stmt = gimple_build_assign (new_idx_val,
6483 MAX_EXPR, idx_val,
6484 old_idx_val);
6485 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6487 tree cond = make_ssa_name (boolean_type_node);
6488 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6489 idx_val, old_idx_val);
6490 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6491 tree new_val = make_ssa_name (data_eltype);
6492 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6493 cond, val, old_val);
6494 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6495 idx_val = new_idx_val;
6496 val = new_val;
6499 /* Convert the reduced value back to the result type and set as the
6500 result. */
6501 gimple_seq stmts = NULL;
6502 val = gimple_convert (&stmts, scalar_type, val);
6503 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6504 scalar_results.safe_push (val);
6507 /* 2.3 Create the reduction code, using one of the three schemes described
6508 above. In SLP we simply need to extract all the elements from the
6509 vector (without reducing them), so we use scalar shifts. */
6510 else if (reduc_fn != IFN_LAST && !slp_reduc)
6512 tree tmp;
6513 tree vec_elem_type;
6515 /* Case 1: Create:
6516 v_out2 = reduc_expr <v_out1> */
6518 if (dump_enabled_p ())
6519 dump_printf_loc (MSG_NOTE, vect_location,
6520 "Reduce using direct vector reduction.\n");
6522 gimple_seq stmts = NULL;
6523 vec_elem_type = TREE_TYPE (vectype);
6524 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6525 vec_elem_type, reduc_inputs[0]);
6526 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6527 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6529 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6530 && induc_val)
6532          /* Earlier we set the initial value to be a vector of induc_val
6533 values. Check the result and if it is induc_val then replace
6534 with the original initial value, unless induc_val is
6535 the same as initial_def already. */
6536 tree zcompare = make_ssa_name (boolean_type_node);
6537 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6538 new_temp, induc_val);
6539 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6540 tree initial_def = reduc_info->reduc_initial_values[0];
6541 tmp = make_ssa_name (new_scalar_dest);
6542 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6543 initial_def, new_temp);
6544 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6545 new_temp = tmp;
6548 scalar_results.safe_push (new_temp);
6550 else if (direct_slp_reduc)
6552 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6553 with the elements for other SLP statements replaced with the
6554 neutral value. We can then do a normal reduction on each vector. */
6556 /* Enforced by vectorizable_reduction. */
6557 gcc_assert (reduc_inputs.length () == 1);
6558 gcc_assert (pow2p_hwi (group_size));
6560 gimple_seq seq = NULL;
6562 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6563 and the same element size as VECTYPE. */
6564 tree index = build_index_vector (vectype, 0, 1);
6565 tree index_type = TREE_TYPE (index);
6566 tree index_elt_type = TREE_TYPE (index_type);
6567 tree mask_type = truth_type_for (index_type);
6569 /* Create a vector that, for each element, identifies which of
6570 the REDUC_GROUP_SIZE results should use it. */
6571 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6572 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6573 build_vector_from_val (index_type, index_mask));
6575 /* Get a neutral vector value. This is simply a splat of the neutral
6576 scalar value if we have one, otherwise the initial scalar value
6577 is itself a neutral value. */
6578 tree vector_identity = NULL_TREE;
6579 tree neutral_op = NULL_TREE;
6580 if (slp_node)
6582 tree initial_value = NULL_TREE;
6583 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6584 initial_value = reduc_info->reduc_initial_values[0];
6585 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6586 initial_value, false);
6588 if (neutral_op)
6589 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6590 neutral_op);
6591 for (unsigned int i = 0; i < group_size; ++i)
6593              /* If there's no universal neutral value, we can use the
6594 initial scalar value from the original PHI. This is used
6595 for MIN and MAX reduction, for example. */
6596 if (!neutral_op)
6598 tree scalar_value = reduc_info->reduc_initial_values[i];
6599 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6600 scalar_value);
6601 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6602 scalar_value);
6605 /* Calculate the equivalent of:
6607 sel[j] = (index[j] == i);
6609 which selects the elements of REDUC_INPUTS[0] that should
6610 be included in the result. */
6611 tree compare_val = build_int_cst (index_elt_type, i);
6612 compare_val = build_vector_from_val (index_type, compare_val);
6613 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6614 index, compare_val);
6616 /* Calculate the equivalent of:
6618 vec = sel ? reduc_inputs[0] : vector_identity;
6620 VEC is now suitable for a full vector reduction. */
6621 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6622 sel, reduc_inputs[0], vector_identity);
6624 /* Do the reduction and convert it to the appropriate type. */
6625 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6626 TREE_TYPE (vectype), vec);
6627 scalar = gimple_convert (&seq, scalar_type, scalar);
6628 scalar_results.safe_push (scalar);
6630 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6632 else
6634 bool reduce_with_shift;
6635 tree vec_temp;
6637 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6639 /* See if the target wants to do the final (shift) reduction
6640 in a vector mode of smaller size and first reduce upper/lower
6641 halves against each other. */
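/* E.g. (illustrative only): for a V8SI accumulator the target may
   prefer to first combine the upper and lower V4SI halves with the
   reduction operation and then finish the shift reduction in the
   narrower V4SI mode. */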
6642 enum machine_mode mode1 = mode;
6643 tree stype = TREE_TYPE (vectype);
6644 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6645 unsigned nunits1 = nunits;
6646 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6647 && reduc_inputs.length () == 1)
6649 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6650 /* For SLP reductions we have to make sure lanes match up, but
6651 since we're doing an individual-element final reduction,
6652 reducing the vector width here is even more important.
6653 ??? We could also separate lanes with permutes; for the common
6654 case of a power-of-two group size, odd/even extracts would work. */
6655 if (slp_reduc && nunits != nunits1)
6657 nunits1 = least_common_multiple (nunits1, group_size);
6658 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6661 if (!slp_reduc
6662 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6663 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6665 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6666 stype, nunits1);
6667 reduce_with_shift = have_whole_vector_shift (mode1);
6668 if (!VECTOR_MODE_P (mode1)
6669 || !directly_supported_p (code, vectype1))
6670 reduce_with_shift = false;
6672 /* First reduce the vector to the desired vector size on which we
6673 should do the shift reduction, by combining upper and lower halves. */
6674 gimple_seq stmts = NULL;
6675 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6676 code, &stmts);
6677 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6678 reduc_inputs[0] = new_temp;
6680 if (reduce_with_shift && !slp_reduc)
6682 int element_bitsize = tree_to_uhwi (bitsize);
6683 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6684 for variable-length vectors and also requires direct target support
6685 for loop reductions. */
6686 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6687 int nelements = vec_size_in_bits / element_bitsize;
6688 vec_perm_builder sel;
6689 vec_perm_indices indices;
6691 int elt_offset;
6693 tree zero_vec = build_zero_cst (vectype1);
6694 /* Case 2: Create:
6695 for (offset = nelements/2; offset >= 1; offset/=2)
6696 {
6697 Create: va' = vec_shift <va, offset>
6698 Create: va = vop <va, va'>
6699 } */
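/* A rough trace, assuming four elements and code == PLUS:
   va = { a, b, c, d }
   offset 2: va' = { c, d, 0, 0 }, va = { a+c, b+d, c, d }
   offset 1: va' = { b+d, c, d, 0 }, va = { a+b+c+d, ... }
   after which the scalar result is extracted from element 0. */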
6701 tree rhs;
6703 if (dump_enabled_p ())
6704 dump_printf_loc (MSG_NOTE, vect_location,
6705 "Reduce using vector shifts\n");
6707 gimple_seq stmts = NULL;
6708 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6709 for (elt_offset = nelements / 2;
6710 elt_offset >= 1;
6711 elt_offset /= 2)
6713 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6714 indices.new_vector (sel, 2, nelements);
6715 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6716 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6717 new_temp, zero_vec, mask);
6718 new_temp = gimple_build (&stmts, code,
6719 vectype1, new_name, new_temp);
6721 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6723 /* 2.4 Extract the final scalar result. Create:
6724 s_out3 = extract_field <v_out2, bitpos> */
6726 if (dump_enabled_p ())
6727 dump_printf_loc (MSG_NOTE, vect_location,
6728 "extract scalar result\n");
6730 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6731 bitsize, bitsize_zero_node);
6732 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6733 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6734 gimple_assign_set_lhs (epilog_stmt, new_temp);
6735 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6736 scalar_results.safe_push (new_temp);
6738 else
6740 /* Case 3: Create:
6741 s = extract_field <v_out2, 0>
6742 for (offset = element_size;
6743 offset < vector_size;
6744 offset += element_size)
6745 {
6746 Create: s' = extract_field <v_out2, offset>
6747 Create: s = op <s, s'> // For non-SLP cases
6748 } */
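/* A rough trace for the non-SLP case, four elements, op == PLUS:
   s = v_out2[0]; s = s + v_out2[1]; s = s + v_out2[2]; s = s + v_out2[3];
   i.e. each element is extracted with a BIT_FIELD_REF and accumulated
   into a single scalar. */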
6750 if (dump_enabled_p ())
6751 dump_printf_loc (MSG_NOTE, vect_location,
6752 "Reduce using scalar code.\n");
6754 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6755 int element_bitsize = tree_to_uhwi (bitsize);
6756 tree compute_type = TREE_TYPE (vectype);
6757 gimple_seq stmts = NULL;
6758 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6760 int bit_offset;
6761 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6762 vec_temp, bitsize, bitsize_zero_node);
6764 /* In SLP we don't need to apply the reduction operation, so we
6765 just collect the s' values in SCALAR_RESULTS. */
6766 if (slp_reduc)
6767 scalar_results.safe_push (new_temp);
6769 for (bit_offset = element_bitsize;
6770 bit_offset < vec_size_in_bits;
6771 bit_offset += element_bitsize)
6773 tree bitpos = bitsize_int (bit_offset);
6774 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6775 compute_type, vec_temp,
6776 bitsize, bitpos);
6777 if (slp_reduc)
6779 /* In SLP we don't need to apply the reduction operation, so
6780 we just collect the s' values in SCALAR_RESULTS. */
6781 new_temp = new_name;
6782 scalar_results.safe_push (new_name);
6784 else
6785 new_temp = gimple_build (&stmts, code, compute_type,
6786 new_name, new_temp);
6790 /* The only case where we need to reduce scalar results in SLP is
6791 unrolling. If the size of SCALAR_RESULTS is greater than
6792 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6793 REDUC_GROUP_SIZE. */
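/* E.g. (illustrative): with REDUC_GROUP_SIZE == 2 and an unrolled
   SCALAR_RESULTS of { a0, b0, a1, b1 } we combine a0 with a1 and
   b0 with b1 and truncate back to { a0 op a1, b0 op b1 }. */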
6794 if (slp_reduc)
6796 tree res, first_res, new_res;
6798 /* Reduce multiple scalar results in case of SLP unrolling. */
6799 for (j = group_size; scalar_results.iterate (j, &res);
6800 j++)
6802 first_res = scalar_results[j % group_size];
6803 new_res = gimple_build (&stmts, code, compute_type,
6804 first_res, res);
6805 scalar_results[j % group_size] = new_res;
6807 scalar_results.truncate (group_size);
6808 for (k = 0; k < group_size; k++)
6809 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6810 scalar_results[k]);
6812 else
6814 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6815 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6816 scalar_results.safe_push (new_temp);
6819 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6822 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6823 && induc_val)
6825 /* Earlier we set the initial value to be a vector of induc_val
6826 values. Check the result and if it is induc_val then replace
6827 it with the original initial value, unless induc_val is
6828 the same as initial_def already. */
6829 tree zcompare = make_ssa_name (boolean_type_node);
6830 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6831 induc_val);
6832 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6833 tree initial_def = reduc_info->reduc_initial_values[0];
6834 tree tmp = make_ssa_name (new_scalar_dest);
6835 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6836 initial_def, new_temp);
6837 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6838 scalar_results[0] = tmp;
6842 /* 2.5 Adjust the final result by the initial value of the reduction
6843 variable. (When such adjustment is not needed, then
6844 'adjustment_def' is zero). For example, if code is PLUS we create:
6845 new_temp = loop_exit_def + adjustment_def */
6847 if (adjustment_def)
6849 gcc_assert (!slp_reduc);
6850 gimple_seq stmts = NULL;
6851 if (double_reduc)
6853 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6854 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6855 new_temp = gimple_build (&stmts, code, vectype,
6856 reduc_inputs[0], adjustment_def);
6858 else
6860 new_temp = scalar_results[0];
6861 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6862 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6863 adjustment_def);
6864 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6865 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6866 new_temp, adjustment_def);
6867 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6870 epilog_stmt = gimple_seq_last_stmt (stmts);
6871 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6872 scalar_results[0] = new_temp;
6875 /* Record this operation if it could be reused by the epilogue loop. */
6876 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6877 && reduc_inputs.length () == 1)
6878 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6879 { orig_reduc_input, reduc_info });
6881 if (double_reduc)
6882 loop = outer_loop;
6884 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6885 phis with new adjusted scalar results, i.e., replace use <s_out0>
6886 with use <s_out4>.
6888 Transform:
6889 loop_exit:
6890 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6891 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6892 v_out2 = reduce <v_out1>
6893 s_out3 = extract_field <v_out2, 0>
6894 s_out4 = adjust_result <s_out3>
6895 use <s_out0>
6896 use <s_out0>
6898 into:
6900 loop_exit:
6901 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6902 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6903 v_out2 = reduce <v_out1>
6904 s_out3 = extract_field <v_out2, 0>
6905 s_out4 = adjust_result <s_out3>
6906 use <s_out4>
6907 use <s_out4> */
6909 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6910 for (k = 0; k < live_out_stmts.size (); k++)
6912 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6913 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6915 phis.create (3);
6916 /* Find the loop-closed-use at the loop exit of the original scalar
6917 result. (The reduction result is expected to have two immediate uses,
6918 one at the latch block, and one at the loop exit). For double
6919 reductions we are looking for exit phis of the outer loop. */
6920 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6922 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6924 if (!is_gimple_debug (USE_STMT (use_p)))
6925 phis.safe_push (USE_STMT (use_p));
6927 else
6929 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6931 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6933 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6935 if (!flow_bb_inside_loop_p (loop,
6936 gimple_bb (USE_STMT (phi_use_p)))
6937 && !is_gimple_debug (USE_STMT (phi_use_p)))
6938 phis.safe_push (USE_STMT (phi_use_p));
6944 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6946 /* Replace the uses: */
6947 orig_name = PHI_RESULT (exit_phi);
6949 /* Look for a single use at the target of the skip edge. */
6950 if (unify_with_main_loop_p)
6952 use_operand_p use_p;
6953 gimple *user;
6954 if (!single_imm_use (orig_name, &use_p, &user))
6955 gcc_unreachable ();
6956 orig_name = gimple_get_lhs (user);
6959 scalar_result = scalar_results[k];
6960 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6962 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6963 SET_USE (use_p, scalar_result);
6964 update_stmt (use_stmt);
6968 phis.release ();
6972 /* Return a vector of type VECTYPE that is equal to the vector select
6973 operation "MASK ? VEC : IDENTITY". Insert the select statements
6974 before GSI. */
6976 static tree
6977 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6978 tree vec, tree identity)
6980 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6981 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6982 mask, vec, identity);
6983 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6984 return cond;
6987 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6988 order, starting with LHS. Insert the extraction statements before GSI and
6989 associate the new scalar SSA names with variable SCALAR_DEST.
6990 If MASK is nonnull, mask the input and then operate on it unconditionally.
6991 Return the SSA name for the result. */
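/* A sketch for a four-element VECTOR_RHS { x0, x1, x2, x3 }:
   s0 = LHS CODE x0; s1 = s0 CODE x1; s2 = s1 CODE x2; s3 = s2 CODE x3;
   and s3 is returned, so the scalar left-to-right evaluation order
   is preserved. */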
6993 static tree
6994 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6995 tree_code code, tree lhs, tree vector_rhs,
6996 tree mask)
6998 tree vectype = TREE_TYPE (vector_rhs);
6999 tree scalar_type = TREE_TYPE (vectype);
7000 tree bitsize = TYPE_SIZE (scalar_type);
7001 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7002 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7004 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7005 to perform an unconditional element-wise reduction of it. */
7006 if (mask)
7008 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7009 "masked_vector_rhs");
7010 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7011 false);
7012 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7013 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7014 mask, vector_rhs, vector_identity);
7015 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7016 vector_rhs = masked_vector_rhs;
7019 for (unsigned HOST_WIDE_INT bit_offset = 0;
7020 bit_offset < vec_size_in_bits;
7021 bit_offset += element_bitsize)
7023 tree bitpos = bitsize_int (bit_offset);
7024 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7025 bitsize, bitpos);
7027 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7028 rhs = make_ssa_name (scalar_dest, stmt);
7029 gimple_assign_set_lhs (stmt, rhs);
7030 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7032 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7033 tree new_name = make_ssa_name (scalar_dest, stmt);
7034 gimple_assign_set_lhs (stmt, new_name);
7035 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7036 lhs = new_name;
7038 return lhs;
7041 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7042 type of the vector input. */
7044 static internal_fn
7045 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7047 internal_fn mask_reduc_fn;
7048 internal_fn mask_len_reduc_fn;
7050 switch (reduc_fn)
7052 case IFN_FOLD_LEFT_PLUS:
7053 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7054 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7055 break;
7057 default:
7058 return IFN_LAST;
7061 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7062 OPTIMIZE_FOR_SPEED))
7063 return mask_reduc_fn;
7064 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7065 OPTIMIZE_FOR_SPEED))
7066 return mask_len_reduc_fn;
7067 return IFN_LAST;
7070 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7071 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7072 statement. CODE is the operation performed by STMT_INFO and OPS are
7073 its scalar operands. REDUC_INDEX is the index of the operand in
7074 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7075 implements in-order reduction, or IFN_LAST if we should open-code it.
7076 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7077 that should be used to control the operation in a fully-masked loop. */
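/* As an illustrative example, for a float accumulation
     for (i = 0; i < n; i++) res += a[i];
   where reassociation is not permitted, the result must be computed as
   ((((res + a[0]) + a[1]) + a[2]) + ...), so each vector of A is folded
   into the scalar accumulator in order (e.g. via IFN_FOLD_LEFT_PLUS)
   instead of being reduced pairwise in the epilogue. */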
7079 static bool
7080 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7081 stmt_vec_info stmt_info,
7082 gimple_stmt_iterator *gsi,
7083 gimple **vec_stmt, slp_tree slp_node,
7084 gimple *reduc_def_stmt,
7085 code_helper code, internal_fn reduc_fn,
7086 tree *ops, int num_ops, tree vectype_in,
7087 int reduc_index, vec_loop_masks *masks,
7088 vec_loop_lens *lens)
7090 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7091 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7092 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7094 int ncopies;
7095 if (slp_node)
7096 ncopies = 1;
7097 else
7098 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7100 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7101 gcc_assert (ncopies == 1);
7103 bool is_cond_op = false;
7104 if (!code.is_tree_code ())
7106 code = conditional_internal_fn_code (internal_fn (code));
7107 gcc_assert (code != ERROR_MARK);
7108 is_cond_op = true;
7111 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7113 if (slp_node)
7115 if (is_cond_op)
7117 if (dump_enabled_p ())
7118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7119 "fold-left reduction on SLP not supported.\n");
7120 return false;
7123 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7124 TYPE_VECTOR_SUBPARTS (vectype_in)));
7127 /* The operands either come from a binary operation or an IFN_COND operation.
7128 The former is a gimple assign with binary rhs and the latter is a
7129 gimple call with four arguments. */
7130 gcc_assert (num_ops == 2 || num_ops == 4);
7131 tree op0, opmask;
7132 if (!is_cond_op)
7133 op0 = ops[1 - reduc_index];
7134 else
7136 op0 = ops[2 + (1 - reduc_index)];
7137 opmask = ops[0];
7138 gcc_assert (!slp_node);
7141 int group_size = 1;
7142 stmt_vec_info scalar_dest_def_info;
7143 auto_vec<tree> vec_oprnds0, vec_opmask;
7144 if (slp_node)
7146 auto_vec<vec<tree> > vec_defs (2);
7147 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7148 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7149 vec_defs[0].release ();
7150 vec_defs[1].release ();
7151 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7152 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7154 else
7156 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7157 op0, &vec_oprnds0);
7158 scalar_dest_def_info = stmt_info;
7160 /* For an IFN_COND_OP we also need the vector mask operand. */
7161 if (is_cond_op)
7162 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7163 opmask, &vec_opmask);
7166 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7167 tree scalar_dest = gimple_get_lhs (sdef);
7168 tree scalar_type = TREE_TYPE (scalar_dest);
7169 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7171 int vec_num = vec_oprnds0.length ();
7172 gcc_assert (vec_num == 1 || slp_node);
7173 tree vec_elem_type = TREE_TYPE (vectype_out);
7174 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7176 tree vector_identity = NULL_TREE;
7177 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7179 vector_identity = build_zero_cst (vectype_out);
7180 if (!HONOR_SIGNED_ZEROS (vectype_out))
7182 else
7184 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7185 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7186 vector_identity);
7190 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7191 int i;
7192 tree def0;
7193 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7195 gimple *new_stmt;
7196 tree mask = NULL_TREE;
7197 tree len = NULL_TREE;
7198 tree bias = NULL_TREE;
7199 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7200 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7201 else if (is_cond_op)
7202 mask = vec_opmask[0];
7203 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7205 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7206 i, 1);
7207 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7208 bias = build_int_cst (intQI_type_node, biasval);
7209 if (!is_cond_op)
7210 mask = build_minus_one_cst (truth_type_for (vectype_in));
7213 /* Handle MINUS by adding the negative. */
7214 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7216 tree negated = make_ssa_name (vectype_out);
7217 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7218 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7219 def0 = negated;
7222 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7223 && mask && mask_reduc_fn == IFN_LAST)
7224 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7225 vector_identity);
7227 /* On the first iteration the input is simply the scalar phi
7228 result, and for subsequent iterations it is the output of
7229 the preceding operation. */
7230 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7232 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7233 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7234 def0, mask, len, bias);
7235 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7236 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7237 def0, mask);
7238 else
7239 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7240 def0);
7241 /* For chained SLP reductions the output of the previous reduction
7242 operation serves as the input of the next. For the final statement
7243 the output cannot be a temporary - we reuse the original
7244 scalar destination of the last statement. */
7245 if (i != vec_num - 1)
7247 gimple_set_lhs (new_stmt, scalar_dest_var);
7248 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7249 gimple_set_lhs (new_stmt, reduc_var);
7252 else
7254 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7255 tree_code (code), reduc_var, def0,
7256 mask);
7257 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7258 /* Remove the statement, so that we can use the same code paths
7259 as for statements that we've just created. */
7260 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7261 gsi_remove (&tmp_gsi, true);
7264 if (i == vec_num - 1)
7266 gimple_set_lhs (new_stmt, scalar_dest);
7267 vect_finish_replace_stmt (loop_vinfo,
7268 scalar_dest_def_info,
7269 new_stmt);
7271 else
7272 vect_finish_stmt_generation (loop_vinfo,
7273 scalar_dest_def_info,
7274 new_stmt, gsi);
7276 if (slp_node)
7277 slp_node->push_vec_def (new_stmt);
7278 else
7280 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7281 *vec_stmt = new_stmt;
7285 return true;
7288 /* Function is_nonwrapping_integer_induction.
7290 Check if STMT_VINFO (which is part of loop LOOP) describes an integer
7291 induction that cannot wrap. */
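/* Illustrative example: for the induction { 0, +, 1 } in a loop that
   executes at most 300 times, BASE + STEP * NITERS = 300 needs 9 bits,
   so the check fails for an 8-bit unsigned induction variable (it could
   wrap) but succeeds for a 16-bit one. */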
7293 static bool
7294 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7296 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7297 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7298 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7299 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7300 widest_int ni, max_loop_value, lhs_max;
7301 wi::overflow_type overflow = wi::OVF_NONE;
7303 /* Make sure the loop is integer based. */
7304 if (TREE_CODE (base) != INTEGER_CST
7305 || TREE_CODE (step) != INTEGER_CST)
7306 return false;
7308 /* Check that the max size of the loop will not wrap. */
7310 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7311 return true;
7313 if (! max_stmt_executions (loop, &ni))
7314 return false;
7316 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7317 &overflow);
7318 if (overflow)
7319 return false;
7321 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7322 TYPE_SIGN (lhs_type), &overflow);
7323 if (overflow)
7324 return false;
7326 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7327 <= TYPE_PRECISION (lhs_type));
7330 /* Check if masking can be supported by inserting a conditional expression.
7331 CODE is the code for the operation. COND_FN is the conditional internal
7332 function, if it exists. VECTYPE_IN is the type of the vector input. */
7333 static bool
7334 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7335 tree vectype_in)
7337 if (cond_fn != IFN_LAST
7338 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7339 OPTIMIZE_FOR_SPEED))
7340 return false;
7342 if (code.is_tree_code ())
7343 switch (tree_code (code))
7345 case DOT_PROD_EXPR:
7346 case SAD_EXPR:
7347 return true;
7349 default:
7350 break;
7352 return false;
7355 /* Insert a conditional expression to enable masked vectorization. CODE is the
7356 code for the operation. VOP is the array of operands. MASK is the loop
7357 mask. GSI is a statement iterator used to place the new conditional
7358 expression. */
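/* For example (a sketch): for DOT_PROD_EXPR the second operand becomes
   MASK ? vop[1] : 0, so inactive lanes add 0 to the accumulator; for
   SAD_EXPR it becomes MASK ? vop[1] : vop[0], so inactive lanes
   contribute |vop[0] - vop[0]| == 0. */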
7359 static void
7360 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7361 gimple_stmt_iterator *gsi)
7363 switch (tree_code (code))
7365 case DOT_PROD_EXPR:
7367 tree vectype = TREE_TYPE (vop[1]);
7368 tree zero = build_zero_cst (vectype);
7369 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7370 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7371 mask, vop[1], zero);
7372 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7373 vop[1] = masked_op1;
7374 break;
7377 case SAD_EXPR:
7379 tree vectype = TREE_TYPE (vop[1]);
7380 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7381 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7382 mask, vop[1], vop[0]);
7383 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7384 vop[1] = masked_op1;
7385 break;
7388 default:
7389 gcc_unreachable ();
7393 /* Function vectorizable_reduction.
7395 Check if STMT_INFO performs a reduction operation that can be vectorized.
7396 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7397 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7398 Return true if STMT_INFO is vectorizable in this way.
7400 This function also handles reduction idioms (patterns) that have been
7401 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7402 may be of this form:
7403 X = pattern_expr (arg0, arg1, ..., X)
7404 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7405 sequence that had been detected and replaced by the pattern-stmt
7406 (STMT_INFO).
7408 This function also handles reduction of condition expressions, for example:
7409 for (int i = 0; i < N; i++)
7410 if (a[i] < value)
7411 last = a[i];
7412 This is handled by vectorizing the loop and creating an additional vector
7413 containing the loop indexes for which "a[i] < value" was true. In the
7414 function epilogue this is reduced to a single max value and then used to
7415 index into the vector of results.
7417 In some cases of reduction patterns, the type of the reduction variable X is
7418 different than the type of the other arguments of STMT_INFO.
7419 In such cases, the vectype that is used when transforming STMT_INFO into
7420 a vector stmt is different than the vectype that is used to determine the
7421 vectorization factor, because it consists of a different number of elements
7422 than the actual number of elements that are being operated upon in parallel.
7424 For example, consider an accumulation of shorts into an int accumulator.
7425 On some targets it's possible to vectorize this pattern operating on 8
7426 shorts at a time (hence, the vectype for purposes of determining the
7427 vectorization factor should be V8HI); on the other hand, the vectype that
7428 is used to create the vector form is actually V4SI (the type of the result).
7430 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7431 indicates what is the actual level of parallelism (V8HI in the example), so
7432 that the right vectorization factor would be derived. This vectype
7433 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7434 be used to create the vectorized stmt. The right vectype for the vectorized
7435 stmt is obtained from the type of the result X:
7436 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7438 This means that, contrary to "regular" reductions (or "regular" stmts in
7439 general), the following equation:
7440 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7441 does *NOT* necessarily hold for reduction patterns. */
7443 bool
7444 vectorizable_reduction (loop_vec_info loop_vinfo,
7445 stmt_vec_info stmt_info, slp_tree slp_node,
7446 slp_instance slp_node_instance,
7447 stmt_vector_for_cost *cost_vec)
7449 tree vectype_in = NULL_TREE;
7450 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7451 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7452 stmt_vec_info cond_stmt_vinfo = NULL;
7453 int i;
7454 int ncopies;
7455 bool single_defuse_cycle = false;
7456 bool nested_cycle = false;
7457 bool double_reduc = false;
7458 int vec_num;
7459 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7460 tree cond_reduc_val = NULL_TREE;
7462 /* Make sure it was already recognized as a reduction computation. */
7463 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7464 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7465 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7466 return false;
7468 /* The stmt we store reduction analysis meta on. */
7469 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7470 reduc_info->is_reduc_info = true;
7472 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7474 if (is_a <gphi *> (stmt_info->stmt))
7476 if (slp_node)
7478 /* We eventually need to set a vector type on invariant
7479 arguments. */
7480 unsigned j;
7481 slp_tree child;
7482 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7483 if (!vect_maybe_update_slp_op_vectype
7484 (child, SLP_TREE_VECTYPE (slp_node)))
7486 if (dump_enabled_p ())
7487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7488 "incompatible vector types for "
7489 "invariants\n");
7490 return false;
7493 /* Analysis for double-reduction is done on the outer
7494 loop PHI; nested cycles have no further restrictions. */
7495 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7497 else
7498 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7499 return true;
7502 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7503 stmt_vec_info phi_info = stmt_info;
7504 if (!is_a <gphi *> (stmt_info->stmt))
7506 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7507 return true;
7509 if (slp_node)
7511 slp_node_instance->reduc_phis = slp_node;
7512 /* ??? We're leaving slp_node to point to the PHIs; we only
7513 need it to get at the number of vector stmts, which wasn't
7514 yet initialized for the instance root. */
7516 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7518 use_operand_p use_p;
7519 gimple *use_stmt;
7520 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7521 &use_p, &use_stmt);
7522 gcc_assert (res);
7523 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7526 /* PHIs should not participate in patterns. */
7527 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7528 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7530 /* Verify that following REDUC_IDX from the latch def leads us back to
7531 the PHI and compute the reduction chain length. Discover the real
7532 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7533 tree reduc_def
7534 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7535 loop_latch_edge
7536 (gimple_bb (reduc_def_phi)->loop_father));
7537 unsigned reduc_chain_length = 0;
7538 bool only_slp_reduc_chain = true;
7539 stmt_info = NULL;
7540 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7541 while (reduc_def != PHI_RESULT (reduc_def_phi))
7543 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7544 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7545 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7547 if (dump_enabled_p ())
7548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7549 "reduction chain broken by patterns.\n");
7550 return false;
7552 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7553 only_slp_reduc_chain = false;
7554 /* For epilogue generation live members of the chain need
7555 to point back to the PHI via their original stmt for
7556 info_for_reduction to work. For SLP we need to look at
7557 all lanes here - even though we will only vectorize from
7558 the SLP node with live lane zero, the other live lanes also
7559 need to be identified as part of a reduction to be able
7560 to skip code generation for them. */
7561 if (slp_for_stmt_info)
7563 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7564 if (STMT_VINFO_LIVE_P (s))
7565 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7567 else if (STMT_VINFO_LIVE_P (vdef))
7568 STMT_VINFO_REDUC_DEF (def) = phi_info;
7569 gimple_match_op op;
7570 if (!gimple_extract_op (vdef->stmt, &op))
7572 if (dump_enabled_p ())
7573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7574 "reduction chain includes unsupported"
7575 " statement type.\n");
7576 return false;
7578 if (CONVERT_EXPR_CODE_P (op.code))
7580 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7582 if (dump_enabled_p ())
7583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7584 "conversion in the reduction chain.\n");
7585 return false;
7588 else if (!stmt_info)
7589 /* First non-conversion stmt. */
7590 stmt_info = vdef;
7591 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7592 reduc_chain_length++;
7593 if (!stmt_info && slp_node)
7594 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7596 /* PHIs should not participate in patterns. */
7597 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7599 if (nested_in_vect_loop_p (loop, stmt_info))
7601 loop = loop->inner;
7602 nested_cycle = true;
7605 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7606 element. */
7607 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7609 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7610 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7612 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7613 gcc_assert (slp_node
7614 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7616 /* 1. Is vectorizable reduction? */
7617 /* Not supportable if the reduction variable is used in the loop, unless
7618 it's a reduction chain. */
7619 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7620 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7621 return false;
7623 /* Reductions that are not used even in an enclosing outer-loop,
7624 are expected to be "live" (used out of the loop). */
7625 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7626 && !STMT_VINFO_LIVE_P (stmt_info))
7627 return false;
7629 /* 2. Has this been recognized as a reduction pattern?
7631 Check if STMT represents a pattern that has been recognized
7632 in earlier analysis stages. For stmts that represent a pattern,
7633 the STMT_VINFO_RELATED_STMT field records the last stmt in
7634 the original sequence that constitutes the pattern. */
7636 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7637 if (orig_stmt_info)
7639 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7640 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7643 /* 3. Check the operands of the operation. The first operands are defined
7644 inside the loop body. The last operand is the reduction variable,
7645 which is defined by the loop-header-phi. */
7647 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7648 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7649 gimple_match_op op;
7650 if (!gimple_extract_op (stmt_info->stmt, &op))
7651 gcc_unreachable ();
7652 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7653 || op.code == WIDEN_SUM_EXPR
7654 || op.code == SAD_EXPR);
7656 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7657 && !SCALAR_FLOAT_TYPE_P (op.type))
7658 return false;
7660 /* Do not try to vectorize bit-precision reductions. */
7661 if (!type_has_mode_precision_p (op.type))
7662 return false;
7664 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7665 which means the only use of the reduction PHI may be in the lane-reducing operation. */
7666 if (lane_reduc_code_p
7667 && reduc_chain_length != 1
7668 && !only_slp_reduc_chain)
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7672 "lane-reducing reduction with extra stmts.\n");
7673 return false;
7676 /* All uses but the last are expected to be defined in the loop.
7677 The last use is the reduction variable. In case of nested cycle this
7678 assumption is not true: we use reduc_index to record the index of the
7679 reduction variable. */
7680 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7681 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7682 /* We need to skip an extra operand for COND_EXPRs with embedded
7683 comparison. */
7684 unsigned opno_adjust = 0;
7685 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7686 opno_adjust = 1;
7687 for (i = 0; i < (int) op.num_ops; i++)
7689 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7690 if (i == 0 && op.code == COND_EXPR)
7691 continue;
7693 stmt_vec_info def_stmt_info;
7694 enum vect_def_type dt;
7695 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7696 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7697 &vectype_op[i], &def_stmt_info))
7699 if (dump_enabled_p ())
7700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7701 "use not simple.\n");
7702 return false;
7704 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7705 continue;
7707 /* For an IFN_COND_OP we might hit the reduction definition operand
7708 twice (once as definition, once as else). */
7709 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7710 continue;
7712 /* There should be only one cycle def in the stmt, the one
7713 leading to reduc_def. */
7714 if (VECTORIZABLE_CYCLE_DEF (dt))
7715 return false;
7717 if (!vectype_op[i])
7718 vectype_op[i]
7719 = get_vectype_for_scalar_type (loop_vinfo,
7720 TREE_TYPE (op.ops[i]), slp_op[i]);
7722 /* To properly compute ncopies we are interested in the widest
7723 non-reduction input type in case we're looking at a widening
7724 accumulation that we later handle in vect_transform_reduction. */
7725 if (lane_reduc_code_p
7726 && vectype_op[i]
7727 && (!vectype_in
7728 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7729 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7730 vectype_in = vectype_op[i];
7732 if (op.code == COND_EXPR)
7734 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7735 if (dt == vect_constant_def)
7737 cond_reduc_dt = dt;
7738 cond_reduc_val = op.ops[i];
7740 if (dt == vect_induction_def
7741 && def_stmt_info
7742 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7744 cond_reduc_dt = dt;
7745 cond_stmt_vinfo = def_stmt_info;
7749 if (!vectype_in)
7750 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7751 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7753 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7754 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7755 /* If we have a condition reduction, see if we can simplify it further. */
7756 if (v_reduc_type == COND_REDUCTION)
7758 if (slp_node)
7759 return false;
7761 /* If the condition itself uses the reduction value, fail. */
7762 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7766 "condition depends on previous iteration\n");
7767 return false;
7770 if (reduc_chain_length == 1
7771 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7772 OPTIMIZE_FOR_SPEED)
7773 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7774 vectype_in,
7775 OPTIMIZE_FOR_SPEED)))
7777 if (dump_enabled_p ())
7778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7779 "optimizing condition reduction with"
7780 " FOLD_EXTRACT_LAST.\n");
7781 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7783 else if (cond_reduc_dt == vect_induction_def)
7785 tree base
7786 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7787 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7789 gcc_assert (TREE_CODE (base) == INTEGER_CST
7790 && TREE_CODE (step) == INTEGER_CST);
7791 cond_reduc_val = NULL_TREE;
7792 enum tree_code cond_reduc_op_code = ERROR_MARK;
7793 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7794 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7796 /* Find a suitable value: below BASE for MAX_EXPR and above BASE
7797 for MIN_EXPR; for now, punt if BASE is the minimum value of the
7798 type for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7799 else if (tree_int_cst_sgn (step) == -1)
7801 cond_reduc_op_code = MIN_EXPR;
7802 if (tree_int_cst_sgn (base) == -1)
7803 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7804 else if (tree_int_cst_lt (base,
7805 TYPE_MAX_VALUE (TREE_TYPE (base))))
7806 cond_reduc_val
7807 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7809 else
7811 cond_reduc_op_code = MAX_EXPR;
7812 if (tree_int_cst_sgn (base) == 1)
7813 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7814 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7815 base))
7816 cond_reduc_val
7817 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7819 if (cond_reduc_val)
7821 if (dump_enabled_p ())
7822 dump_printf_loc (MSG_NOTE, vect_location,
7823 "condition expression based on "
7824 "integer induction.\n");
7825 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7826 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7827 = cond_reduc_val;
7828 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7831 else if (cond_reduc_dt == vect_constant_def)
7833 enum vect_def_type cond_initial_dt;
7834 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7835 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7836 if (cond_initial_dt == vect_constant_def
7837 && types_compatible_p (TREE_TYPE (cond_initial_val),
7838 TREE_TYPE (cond_reduc_val)))
7840 tree e = fold_binary (LE_EXPR, boolean_type_node,
7841 cond_initial_val, cond_reduc_val);
7842 if (e && (integer_onep (e) || integer_zerop (e)))
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_NOTE, vect_location,
7846 "condition expression based on "
7847 "compile time constant.\n");
7848 /* Record reduction code at analysis stage. */
7849 STMT_VINFO_REDUC_CODE (reduc_info)
7850 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7851 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7857 if (STMT_VINFO_LIVE_P (phi_info))
7858 return false;
7860 if (slp_node)
7861 ncopies = 1;
7862 else
7863 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7865 gcc_assert (ncopies >= 1);
7867 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7869 if (nested_cycle)
7871 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7872 == vect_double_reduction_def);
7873 double_reduc = true;
7876 /* 4.2. Check support for the epilog operation.
7878 If STMT represents a reduction pattern, then the type of the
7879 reduction variable may be different than the type of the rest
7880 of the arguments. For example, consider the case of accumulation
7881 of shorts into an int accumulator. The original code:
7882 S1: int_a = (int) short_a;
7883 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7885 was replaced with:
7886 STMT: int_acc = widen_sum <short_a, int_acc>
7888 This means that:
7889 1. The tree-code that is used to create the vector operation in the
7890 epilog code (that reduces the partial results) is not the
7891 tree-code of STMT, but is rather the tree-code of the original
7892 stmt from the pattern that STMT is replacing. I.e, in the example
7893 above we want to use 'widen_sum' in the loop, but 'plus' in the
7894 epilog.
7895 2. The type (mode) we use to check available target support
7896 for the vector operation to be created in the *epilog*, is
7897 determined by the type of the reduction variable (in the example
7898 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7899 However the type (mode) we use to check available target support
7900 for the vector operation to be created *inside the loop*, is
7901 determined by the type of the other arguments to STMT (in the
7902 example we'd check this: optab_handler (widen_sum_optab,
7903 vect_short_mode)).
7905 This is contrary to "regular" reductions, in which the types of all
7906 the arguments are the same as the type of the reduction variable.
7907 For "regular" reductions we can therefore use the same vector type
7908 (and also the same tree-code) when generating the epilog code and
7909 when generating the code inside the loop. */
7911 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7913 /* Conversion might have created a conditional operation like
7914 IFN_COND_ADD already. Use the internal code for the following checks. */
7915 if (orig_code.is_internal_fn ())
7917 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7918 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7921 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7923 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7924 if (reduction_type == TREE_CODE_REDUCTION)
7926 /* Check whether it's ok to change the order of the computation.
7927 Generally, when vectorizing a reduction we change the order of the
7928 computation. This may change the behavior of the program in some
7929 cases, so we need to check that this is ok. One exception is when
7930 vectorizing an outer-loop: the inner-loop is executed sequentially,
7931 and therefore vectorizing reductions in the inner-loop during
7932 outer-loop vectorization is safe. Likewise, when we are vectorizing
7933 a series of reductions using SLP and the VF is one, the reductions
7934 are performed in scalar order. */
7935 if (slp_node
7936 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7937 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7939 else if (needs_fold_left_reduction_p (op.type, orig_code))
7941 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7942 is not directly used in the stmt. */
7943 if (!only_slp_reduc_chain
7944 && reduc_chain_length != 1)
7946 if (dump_enabled_p ())
7947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7948 "in-order reduction chain without SLP.\n");
7949 return false;
7951 STMT_VINFO_REDUC_TYPE (reduc_info)
7952 = reduction_type = FOLD_LEFT_REDUCTION;
7954 else if (!commutative_binary_op_p (orig_code, op.type)
7955 || !associative_binary_op_p (orig_code, op.type))
7957 if (dump_enabled_p ())
7958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7959 "reduction: not commutative/associative\n");
7960 return false;
7964 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7965 && ncopies > 1)
7967 if (dump_enabled_p ())
7968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7969 "multiple types in double reduction or condition "
7970 "reduction or fold-left reduction.\n");
7971 return false;
7974 internal_fn reduc_fn = IFN_LAST;
7975 if (reduction_type == TREE_CODE_REDUCTION
7976 || reduction_type == FOLD_LEFT_REDUCTION
7977 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7978 || reduction_type == CONST_COND_REDUCTION)
7980 if (reduction_type == FOLD_LEFT_REDUCTION
7981 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7982 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7984 if (reduc_fn != IFN_LAST
7985 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7986 OPTIMIZE_FOR_SPEED))
7988 if (dump_enabled_p ())
7989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7990 "reduc op not supported by target.\n");
7992 reduc_fn = IFN_LAST;
7995 else
7997 if (!nested_cycle || double_reduc)
7999 if (dump_enabled_p ())
8000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8001 "no reduc code for scalar code.\n");
8003 return false;
8007 else if (reduction_type == COND_REDUCTION)
8009 int scalar_precision
8010 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8011 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8012 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8013 vectype_out);
8015 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8016 OPTIMIZE_FOR_SPEED))
8017 reduc_fn = IFN_REDUC_MAX;
8019 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8021 if (reduction_type != EXTRACT_LAST_REDUCTION
8022 && (!nested_cycle || double_reduc)
8023 && reduc_fn == IFN_LAST
8024 && !nunits_out.is_constant ())
8026 if (dump_enabled_p ())
8027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8028 "missing target support for reduction on"
8029 " variable-length vectors.\n");
8030 return false;
8033 /* For SLP reductions, see if there is a neutral value we can use. */
8034 tree neutral_op = NULL_TREE;
8035 if (slp_node)
8037 tree initial_value = NULL_TREE;
8038 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8039 initial_value = vect_phi_initial_value (reduc_def_phi);
8040 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8041 orig_code, initial_value);
8044 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8046 /* We can't support in-order reductions of code such as this:
8048 for (int i = 0; i < n1; ++i)
8049 for (int j = 0; j < n2; ++j)
8050 l += a[j];
8052 since GCC effectively transforms the loop when vectorizing:
8054 for (int i = 0; i < n1 / VF; ++i)
8055 for (int j = 0; j < n2; ++j)
8056 for (int k = 0; k < VF; ++k)
8057 l += a[j];
8059 which is a reassociation of the original operation. */
8060 if (dump_enabled_p ())
8061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8062 "in-order double reduction not supported.\n");
8064 return false;
8067 if (reduction_type == FOLD_LEFT_REDUCTION
8068 && slp_node
8069 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8071 /* We cannot use in-order reductions in this case because there is
8072 an implicit reassociation of the operations involved. */
8073 if (dump_enabled_p ())
8074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8075 "in-order unchained SLP reductions not supported.\n");
8076 return false;
8079 /* For double reductions, and for SLP reductions with a neutral value,
8080 we construct a variable-length initial vector by loading a vector
8081 full of the neutral value and then shift-and-inserting the start
8082 values into the low-numbered elements. */
8083 if ((double_reduc || neutral_op)
8084 && !nunits_out.is_constant ()
8085 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8086 vectype_out, OPTIMIZE_FOR_SPEED))
8088 if (dump_enabled_p ())
8089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8090 "reduction on variable-length vectors requires"
8091 " target support for a vector-shift-and-insert"
8092 " operation.\n");
8093 return false;
8096 /* Check extra constraints for variable-length unchained SLP reductions. */
8097 if (slp_node
8098 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8099 && !nunits_out.is_constant ())
8101 /* We checked above that we could build the initial vector when
8102 there's a neutral element value. Check here for the case in
8103 which each SLP statement has its own initial value and in which
8104 that value needs to be repeated for every instance of the
8105 statement within the initial vector. */
8106 unsigned int group_size = SLP_TREE_LANES (slp_node);
8107 if (!neutral_op
8108 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8109 TREE_TYPE (vectype_out)))
8111 if (dump_enabled_p ())
8112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113 "unsupported form of SLP reduction for"
8114 " variable-length vectors: cannot build"
8115 " initial vector.\n");
8116 return false;
8118 /* The epilogue code relies on the number of elements being a multiple
8119 of the group size. The duplicate-and-interleave approach to setting
8120 up the initial vector does too. */
8121 if (!multiple_p (nunits_out, group_size))
8123 if (dump_enabled_p ())
8124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8125 "unsupported form of SLP reduction for"
8126 " variable-length vectors: the vector size"
8127 " is not a multiple of the number of results.\n");
8128 return false;
8132 if (reduction_type == COND_REDUCTION)
8134 widest_int ni;
8136 if (! max_loop_iterations (loop, &ni))
8138 if (dump_enabled_p ())
8139 dump_printf_loc (MSG_NOTE, vect_location,
8140 "loop count not known, cannot create cond "
8141 "reduction.\n");
8142 return false;
8144 /* Convert backedges to iterations. */
8145 ni += 1;
8147 /* The additional index will be the same type as the condition. Check
8148 that the loop count fits into this type less one (because we'll use up the
8149 zero slot for when there are no matches). */
8150 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8151 if (wi::geu_p (ni, wi::to_widest (max_index)))
8153 if (dump_enabled_p ())
8154 dump_printf_loc (MSG_NOTE, vect_location,
8155 "loop size is greater than data size.\n");
8156 return false;
8160 /* In case the vectorization factor (VF) is bigger than the number
8161 of elements that we can fit in a vectype (nunits), we have to generate
8162 more than one vector stmt, i.e., we need to "unroll" the
8163 vector stmt by a factor VF/nunits. For more details see documentation
8164 in vectorizable_operation. */
8166 /* If the reduction is used in an outer loop we need to generate
8167 VF intermediate results, like so (e.g. for ncopies=2):
8168 r0 = phi (init, r0)
8169 r1 = phi (init, r1)
8170 r0 = x0 + r0;
8171 r1 = x1 + r1;
8172 (i.e. we generate VF results in 2 registers).
8173 In this case we have a separate def-use cycle for each copy, and therefore
8174 for each copy we get the vector def for the reduction variable from the
8175 respective phi node created for this copy.
8177 Otherwise (the reduction is unused in the loop nest), we can combine
8178 together intermediate results, like so (e.g. for ncopies=2):
8179 r = phi (init, r)
8180 r = x0 + r;
8181 r = x1 + r;
8182 (i.e. we generate VF/2 results in a single register).
8183 In this case for each copy we get the vector def for the reduction variable
8184 from the vectorized reduction operation generated in the previous iteration.
8186 This only works when we see both the reduction PHI and its only consumer
8187 in vectorizable_reduction and there are no intermediate stmts
8188 participating. When unrolling we want each unrolled iteration to have its
8189 own reduction accumulator since one of the main goals of unrolling a
8190 reduction is to reduce the aggregate loop-carried latency. */
8191 if (ncopies > 1
8192 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8193 && reduc_chain_length == 1
8194 && loop_vinfo->suggested_unroll_factor == 1)
8195 single_defuse_cycle = true;
8197 if (single_defuse_cycle || lane_reduc_code_p)
8199 gcc_assert (op.code != COND_EXPR);
8201 /* 4. Supportable by target? */
8202 bool ok = true;
8204 /* 4.1. check support for the operation in the loop
8206 This isn't necessary for the lane reduction codes, since they
8207 can only be produced by pattern matching, and it's up to the
8208 pattern matcher to test for support. The main reason for
8209 specifically skipping this step is to avoid rechecking whether
8210 mixed-sign dot-products can be implemented using signed
8211 dot-products. */
8212 machine_mode vec_mode = TYPE_MODE (vectype_in);
8213 if (!lane_reduc_code_p
8214 && !directly_supported_p (op.code, vectype_in, optab_vector))
8216 if (dump_enabled_p ())
8217 dump_printf (MSG_NOTE, "op not supported by target.\n");
8218 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8219 || !vect_can_vectorize_without_simd_p (op.code))
8220 ok = false;
8221 else
8222 if (dump_enabled_p ())
8223 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8226 if (vect_emulated_vector_p (vectype_in)
8227 && !vect_can_vectorize_without_simd_p (op.code))
8229 if (dump_enabled_p ())
8230 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8231 return false;
8234 /* Lane-reducing operations have to go through vect_transform_reduction.
8235 For the other cases, try without the single-cycle optimization. */
8236 if (!ok)
8238 if (lane_reduc_code_p)
8239 return false;
8240 else
8241 single_defuse_cycle = false;
8244 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8246 /* If the reduction stmt is one of the patterns that have lane
8247 reduction embedded, we cannot handle the case of !single_defuse_cycle. */
8248 if ((ncopies > 1 && ! single_defuse_cycle)
8249 && lane_reduc_code_p)
8251 if (dump_enabled_p ())
8252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8253 "multi def-use cycle not possible for lane-reducing "
8254 "reduction operation\n");
8255 return false;
8258 if (slp_node
8259 && !(!single_defuse_cycle
8260 && !lane_reduc_code_p
8261 && reduction_type != FOLD_LEFT_REDUCTION))
8262 for (i = 0; i < (int) op.num_ops; i++)
8263 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8265 if (dump_enabled_p ())
8266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8267 "incompatible vector types for invariants\n");
8268 return false;
8271 if (slp_node)
8272 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8273 else
8274 vec_num = 1;
8276 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8277 reduction_type, ncopies, cost_vec);
8278 /* Cost the reduction op inside the loop if transformed via
8279 vect_transform_reduction. Otherwise this is costed by the
8280 separate vectorizable_* routines. */
8281 if (single_defuse_cycle || lane_reduc_code_p)
8283 int factor = 1;
8284 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8285 /* Three dot-products and a subtraction. */
8286 factor = 4;
8287 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8288 stmt_info, 0, vect_body);
8291 if (dump_enabled_p ()
8292 && reduction_type == FOLD_LEFT_REDUCTION)
8293 dump_printf_loc (MSG_NOTE, vect_location,
8294 "using an in-order (fold-left) reduction.\n");
8295 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8296 /* All reductions but single defuse-cycle optimized, lane-reducing and
8297 fold-left ones go through their own vectorizable_* routines. */
8298 if (!single_defuse_cycle
8299 && !lane_reduc_code_p
8300 && reduction_type != FOLD_LEFT_REDUCTION)
8302 stmt_vec_info tem
8303 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8304 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8306 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8307 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8309 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8310 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8312 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8314 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8315 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8316 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8318 if (reduction_type != FOLD_LEFT_REDUCTION
8319 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8320 && (cond_fn == IFN_LAST
8321 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8322 OPTIMIZE_FOR_SPEED)))
8324 if (dump_enabled_p ())
8325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8326 "can't operate on partial vectors because"
8327 " no conditional operation is available.\n");
8328 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8330 else if (reduction_type == FOLD_LEFT_REDUCTION
8331 && reduc_fn == IFN_LAST
8332 && !expand_vec_cond_expr_p (vectype_in,
8333 truth_type_for (vectype_in),
8334 SSA_NAME))
8336 if (dump_enabled_p ())
8337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8338 "can't operate on partial vectors because"
8339 " no conditional operation is available.\n");
8340 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8342 else if (reduction_type == FOLD_LEFT_REDUCTION
8343 && internal_fn_mask_index (reduc_fn) == -1
8344 && FLOAT_TYPE_P (vectype_in)
8345 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8347 if (dump_enabled_p ())
8348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8349 "can't operate on partial vectors because"
8350 " signed zeros cannot be preserved.\n");
8351 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8353 else
8355 internal_fn mask_reduc_fn
8356 = get_masked_reduction_fn (reduc_fn, vectype_in);
8358 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8359 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8360 vectype_in, 1);
8361 else
8362 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8363 vectype_in, NULL);
8366 return true;
8369 /* STMT_INFO is a dot-product reduction whose multiplication operands
8370 have different signs. Emit a sequence to emulate the operation
8371 using a series of signed DOT_PROD_EXPRs and return the last
8372 statement generated. VEC_DEST is the result of the vector operation
8373 and VOP lists its inputs. */
8375 static gassign *
8376 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8377 gimple_stmt_iterator *gsi, tree vec_dest,
8378 tree vop[3])
8380 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8381 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8382 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8383 gimple *new_stmt;
8385 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8386 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8387 std::swap (vop[0], vop[1]);
8389 /* Convert all inputs to signed types. */
8390 for (int i = 0; i < 3; ++i)
8391 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8393 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8394 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8395 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8396 vop[i] = tmp;
8399 /* In the comments below we assume 8-bit inputs for simplicity,
8400 but the approach works for any full integer type. */
8402 /* Create a vector of -128. */
8403 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8404 tree min_narrow = build_vector_from_val (narrow_vectype,
8405 min_narrow_elttype);
8407 /* Create a vector of 64. */
8408 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8409 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8410 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8412 /* Emit: SUB_RES = VOP[0] - 128. */
8413 tree sub_res = make_ssa_name (narrow_vectype);
8414 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8415 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8417 /* Emit:
8419 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8420 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8421 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8423 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8424 Doing the two 64 * y steps first allows more time to compute x. */
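/* A quick sanity check of the identity with illustrative values: for
   unsigned x = 200 and signed y = -3 we want x * y = -600, and indeed
   (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600, while
   200 - 128 = 72 now fits in the signed 8-bit range required by the
   signed DOT_PROD_EXPRs.  */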
8425 tree stage1 = make_ssa_name (wide_vectype);
8426 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8427 vop[1], half_narrow, vop[2]);
8428 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8430 tree stage2 = make_ssa_name (wide_vectype);
8431 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8432 vop[1], half_narrow, stage1);
8433 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8435 tree stage3 = make_ssa_name (wide_vectype);
8436 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8437 sub_res, vop[1], stage2);
8438 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8440 /* Convert STAGE3 to the reduction type. */
8441 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8444 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8445 value. */
8447 bool
8448 vect_transform_reduction (loop_vec_info loop_vinfo,
8449 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8450 gimple **vec_stmt, slp_tree slp_node)
8452 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8453 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8454 int i;
8455 int ncopies;
8456 int vec_num;
8458 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8459 gcc_assert (reduc_info->is_reduc_info);
8461 if (nested_in_vect_loop_p (loop, stmt_info))
8463 loop = loop->inner;
8464 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8467 gimple_match_op op;
8468 if (!gimple_extract_op (stmt_info->stmt, &op))
8469 gcc_unreachable ();
8471 /* All uses but the last are expected to be defined in the loop.
8472 The last use is the reduction variable. In case of nested cycle this
8473 assumption is not true: we use reduc_index to record the index of the
8474 reduction variable. */
8475 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8476 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8477 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8478 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8480 if (slp_node)
8482 ncopies = 1;
8483 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8485 else
8487 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8488 vec_num = 1;
8491 code_helper code = canonicalize_code (op.code, op.type);
8492 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8494 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8495 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8496 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8498 /* Transform. */
8499 tree new_temp = NULL_TREE;
8500 auto_vec<tree> vec_oprnds0;
8501 auto_vec<tree> vec_oprnds1;
8502 auto_vec<tree> vec_oprnds2;
8503 tree def0;
8505 if (dump_enabled_p ())
8506 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8508 /* FORNOW: Multiple types are not supported for condition. */
8509 if (code == COND_EXPR)
8510 gcc_assert (ncopies == 1);
8512 /* A binary COND_OP reduction must have the same definition and else
8513 value. */
8514 bool cond_fn_p = code.is_internal_fn ()
8515 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8516 if (cond_fn_p)
8518 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8519 || code == IFN_COND_MUL || code == IFN_COND_AND
8520 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8521 gcc_assert (op.num_ops == 4
8522 && (op.ops[reduc_index]
8523 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8526 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8528 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8529 if (reduction_type == FOLD_LEFT_REDUCTION)
8531 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8532 gcc_assert (code.is_tree_code () || cond_fn_p);
8533 return vectorize_fold_left_reduction
8534 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8535 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8536 reduc_index, masks, lens);
8539 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8540 gcc_assert (single_defuse_cycle
8541 || code == DOT_PROD_EXPR
8542 || code == WIDEN_SUM_EXPR
8543 || code == SAD_EXPR);
8545 /* Create the destination vector */
8546 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8547 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8549 /* Get NCOPIES vector definitions for all operands except the reduction
8550 definition. */
8551 if (!cond_fn_p)
8553 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8554 single_defuse_cycle && reduc_index == 0
8555 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8556 single_defuse_cycle && reduc_index == 1
8557 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8558 op.num_ops == 3
8559 && !(single_defuse_cycle && reduc_index == 2)
8560 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8562 else
8564 /* For a conditional operation pass the truth type as mask
8565 vectype. */
8566 gcc_assert (single_defuse_cycle
8567 && (reduc_index == 1 || reduc_index == 2));
8568 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8569 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8570 reduc_index == 1 ? NULL_TREE : op.ops[1],
8571 NULL_TREE, &vec_oprnds1,
8572 reduc_index == 2 ? NULL_TREE : op.ops[2],
8573 NULL_TREE, &vec_oprnds2);
8576 /* For single def-use cycles get one copy of the vectorized reduction
8577 definition. */
8578 if (single_defuse_cycle)
8580 gcc_assert (!slp_node);
8581 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8582 op.ops[reduc_index],
8583 reduc_index == 0 ? &vec_oprnds0
8584 : (reduc_index == 1 ? &vec_oprnds1
8585 : &vec_oprnds2));
8588 bool emulated_mixed_dot_prod
8589 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8590 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8592 gimple *new_stmt;
8593 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8594 if (masked_loop_p && !mask_by_cond_expr)
8596 /* No conditional ifns have been defined for dot-product yet. */
8597 gcc_assert (code != DOT_PROD_EXPR);
8599 /* Make sure that the reduction accumulator is vop[0]. */
8600 if (reduc_index == 1)
8602 gcc_assert (commutative_binary_op_p (code, op.type));
8603 std::swap (vop[0], vop[1]);
8605 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8606 vec_num * ncopies, vectype_in, i);
8607 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8608 vop[0], vop[1], vop[0]);
8609 new_temp = make_ssa_name (vec_dest, call);
8610 gimple_call_set_lhs (call, new_temp);
8611 gimple_call_set_nothrow (call, true);
8612 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8613 new_stmt = call;
8615 else
8617 if (op.num_ops >= 3)
8618 vop[2] = vec_oprnds2[i];
8620 if (masked_loop_p && mask_by_cond_expr)
8622 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8623 vec_num * ncopies, vectype_in, i);
8624 build_vect_cond_expr (code, vop, mask, gsi);
8627 if (emulated_mixed_dot_prod)
8628 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8629 vec_dest, vop);
8631 else if (code.is_internal_fn () && !cond_fn_p)
8632 new_stmt = gimple_build_call_internal (internal_fn (code),
8633 op.num_ops,
8634 vop[0], vop[1], vop[2]);
8635 else if (code.is_internal_fn () && cond_fn_p)
8636 new_stmt = gimple_build_call_internal (internal_fn (code),
8637 op.num_ops,
8638 vop[0], vop[1], vop[2],
8639 vop[1]);
8640 else
8641 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8642 vop[0], vop[1], vop[2]);
8643 new_temp = make_ssa_name (vec_dest, new_stmt);
8644 gimple_set_lhs (new_stmt, new_temp);
8645 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8648 if (slp_node)
8649 slp_node->push_vec_def (new_stmt);
8650 else if (single_defuse_cycle
8651 && i < ncopies - 1)
8653 if (reduc_index == 0)
8654 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8655 else if (reduc_index == 1)
8656 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8657 else if (reduc_index == 2)
8658 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8660 else
8661 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8664 if (!slp_node)
8665 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8667 return true;
8670 /* Transform phase of a cycle PHI. */
8672 bool
8673 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8674 stmt_vec_info stmt_info, gimple **vec_stmt,
8675 slp_tree slp_node, slp_instance slp_node_instance)
8677 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8678 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8679 int i;
8680 int ncopies;
8681 int j;
8682 bool nested_cycle = false;
8683 int vec_num;
8685 if (nested_in_vect_loop_p (loop, stmt_info))
8687 loop = loop->inner;
8688 nested_cycle = true;
8691 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8692 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8693 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8694 gcc_assert (reduc_info->is_reduc_info);
8696 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8697 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8698 /* Leave the scalar phi in place. */
8699 return true;
8701 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8702 /* For a nested cycle we do not fill the above. */
8703 if (!vectype_in)
8704 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8705 gcc_assert (vectype_in);
8707 if (slp_node)
8709 /* The size vect_schedule_slp_instance computes is off for us. */
8710 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8711 * SLP_TREE_LANES (slp_node), vectype_in);
8712 ncopies = 1;
8714 else
8716 vec_num = 1;
8717 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8720 /* Check whether we should use a single PHI node and accumulate
8721 vectors to one before the backedge. */
8722 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8723 ncopies = 1;
8725 /* Create the destination vector */
8726 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8727 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8728 vectype_out);
8730 /* Get the loop-entry arguments. */
8731 tree vec_initial_def = NULL_TREE;
8732 auto_vec<tree> vec_initial_defs;
8733 if (slp_node)
8735 vec_initial_defs.reserve (vec_num);
8736 if (nested_cycle)
8738 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8739 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8740 &vec_initial_defs);
8742 else
8744 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8745 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8746 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8748 unsigned int num_phis = stmts.length ();
8749 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8750 num_phis = 1;
8751 initial_values.reserve (num_phis);
8752 for (unsigned int i = 0; i < num_phis; ++i)
8754 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8755 initial_values.quick_push (vect_phi_initial_value (this_phi));
8757 if (vec_num == 1)
8758 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8759 if (!initial_values.is_empty ())
8761 tree initial_value
8762 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8763 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8764 tree neutral_op
8765 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8766 code, initial_value);
8767 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8768 &vec_initial_defs, vec_num,
8769 stmts.length (), neutral_op);
8773 else
8775 /* Get at the scalar def before the loop, that defines the initial
8776 value of the reduction variable. */
8777 tree initial_def = vect_phi_initial_value (phi);
8778 reduc_info->reduc_initial_values.safe_push (initial_def);
8779 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8780 and we can't use zero for induc_val, use initial_def. Similarly
8781 for REDUC_MIN and initial_def larger than the base. */
8782 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8784 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8785 if (TREE_CODE (initial_def) == INTEGER_CST
8786 && !integer_zerop (induc_val)
8787 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8788 && tree_int_cst_lt (initial_def, induc_val))
8789 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8790 && tree_int_cst_lt (induc_val, initial_def))))
8792 induc_val = initial_def;
8793 /* Communicate to epilogue generation that we used
8794 the initial_def. */
8795 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8797 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8799 else if (nested_cycle)
8801 /* Do not use an adjustment def as that case is not supported
8802 correctly if ncopies is not one. */
8803 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8804 ncopies, initial_def,
8805 &vec_initial_defs);
8807 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8808 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8809 /* Fill the initial vector with the initial scalar value. */
8810 vec_initial_def
8811 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8812 initial_def, initial_def);
8813 else
8815 if (ncopies == 1)
8816 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8817 if (!reduc_info->reduc_initial_values.is_empty ())
8819 initial_def = reduc_info->reduc_initial_values[0];
8820 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8821 tree neutral_op
8822 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8823 code, initial_def);
8824 gcc_assert (neutral_op);
8825 /* Try to simplify the vector initialization by applying an
8826 adjustment after the reduction has been performed. */
8827 if (!reduc_info->reused_accumulator
8828 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8829 && !operand_equal_p (neutral_op, initial_def))
8831 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8832 = initial_def;
8833 initial_def = neutral_op;
8835 vec_initial_def
8836 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8837 initial_def, neutral_op);
8842 if (vec_initial_def)
8844 vec_initial_defs.create (ncopies);
8845 for (i = 0; i < ncopies; ++i)
8846 vec_initial_defs.quick_push (vec_initial_def);
8849 if (auto *accumulator = reduc_info->reused_accumulator)
8851 tree def = accumulator->reduc_input;
8852 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8854 unsigned int nreduc;
8855 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8856 (TREE_TYPE (def)),
8857 TYPE_VECTOR_SUBPARTS (vectype_out),
8858 &nreduc);
8859 gcc_assert (res);
8860 gimple_seq stmts = NULL;
8861 /* Reduce the single vector to a smaller one. */
8862 if (nreduc != 1)
8864 /* Perform the reduction in the appropriate type. */
8865 tree rvectype = vectype_out;
8866 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8867 TREE_TYPE (TREE_TYPE (def))))
8868 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8869 TYPE_VECTOR_SUBPARTS
8870 (vectype_out));
8871 def = vect_create_partial_epilog (def, rvectype,
8872 STMT_VINFO_REDUC_CODE
8873 (reduc_info),
8874 &stmts);
8876 /* The epilogue loop might use a different vector mode, like
8877 VNx2DI vs. V2DI. */
8878 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8880 tree reduc_type = build_vector_type_for_mode
8881 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8882 def = gimple_convert (&stmts, reduc_type, def);
8884 /* Adjust the input so we pick up the partially reduced value
8885 for the skip edge in vect_create_epilog_for_reduction. */
8886 accumulator->reduc_input = def;
8887 /* And the reduction could be carried out using a different sign. */
8888 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8889 def = gimple_convert (&stmts, vectype_out, def);
8890 if (loop_vinfo->main_loop_edge)
8892 /* While we'd like to insert on the edge, this would split
8893 blocks and disturb bookkeeping, and we will eventually
8894 need this on the skip edge as well. Rely on sinking to
8895 fix up optimal placement and insert in the pred. */
8896 gimple_stmt_iterator gsi
8897 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8898 /* Insert before a cond that eventually skips the
8899 epilogue. */
8900 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8901 gsi_prev (&gsi);
8902 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8904 else
8905 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8906 stmts);
8908 if (loop_vinfo->main_loop_edge)
8909 vec_initial_defs[0]
8910 = vect_get_main_loop_result (loop_vinfo, def,
8911 vec_initial_defs[0]);
8912 else
8913 vec_initial_defs.safe_push (def);
8916 /* Generate the reduction PHIs upfront. */
8917 for (i = 0; i < vec_num; i++)
8919 tree vec_init_def = vec_initial_defs[i];
8920 for (j = 0; j < ncopies; j++)
8922 /* Create the reduction-phi that defines the reduction
8923 operand. */
8924 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8926 /* Set the loop-entry arg of the reduction-phi. */
8927 if (j != 0 && nested_cycle)
8928 vec_init_def = vec_initial_defs[j];
8929 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8930 UNKNOWN_LOCATION);
8932 /* The loop-latch arg is set in epilogue processing. */
8934 if (slp_node)
8935 slp_node->push_vec_def (new_phi);
8936 else
8938 if (j == 0)
8939 *vec_stmt = new_phi;
8940 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8945 return true;
8948 /* Vectorizes LC PHIs. */
8950 bool
8951 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8952 stmt_vec_info stmt_info, gimple **vec_stmt,
8953 slp_tree slp_node)
8955 if (!loop_vinfo
8956 || !is_a <gphi *> (stmt_info->stmt)
8957 || gimple_phi_num_args (stmt_info->stmt) != 1)
8958 return false;
8960 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8961 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8962 return false;
8964 if (!vec_stmt) /* transformation not required. */
8966 /* Deal with copies from externs or constants that are disguised as
8967 loop-closed PHI nodes (PR97886). */
8968 if (slp_node
8969 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8970 SLP_TREE_VECTYPE (slp_node)))
8972 if (dump_enabled_p ())
8973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8974 "incompatible vector types for invariants\n");
8975 return false;
8977 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8978 return true;
8981 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8982 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8983 basic_block bb = gimple_bb (stmt_info->stmt);
8984 edge e = single_pred_edge (bb);
8985 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8986 auto_vec<tree> vec_oprnds;
8987 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8988 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8989 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8990 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8992 /* Create the vectorized LC PHI node. */
8993 gphi *new_phi = create_phi_node (vec_dest, bb);
8994 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8995 if (slp_node)
8996 slp_node->push_vec_def (new_phi);
8997 else
8998 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9000 if (!slp_node)
9001 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9003 return true;
9006 /* Vectorizes PHIs. */
9008 bool
9009 vectorizable_phi (vec_info *,
9010 stmt_vec_info stmt_info, gimple **vec_stmt,
9011 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9013 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9014 return false;
9016 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9017 return false;
9019 tree vectype = SLP_TREE_VECTYPE (slp_node);
9021 if (!vec_stmt) /* transformation not required. */
9023 slp_tree child;
9024 unsigned i;
9025 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9026 if (!child)
9028 if (dump_enabled_p ())
9029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9030 "PHI node with unvectorized backedge def\n");
9031 return false;
9033 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9035 if (dump_enabled_p ())
9036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9037 "incompatible vector types for invariants\n");
9038 return false;
9040 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9041 && !useless_type_conversion_p (vectype,
9042 SLP_TREE_VECTYPE (child)))
9044 /* With bools we can have mask and non-mask precision vectors
9045 or different non-mask precisions. While pattern recog is
9046 supposed to guarantee consistency here, bugs in it can cause
9047 mismatches (PR103489 and PR103800 for example).
9048 Deal with them here instead of ICEing later. */
9049 if (dump_enabled_p ())
9050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9051 "incompatible vector type setup from "
9052 "bool pattern detection\n");
9053 return false;
9056 /* For single-argument PHIs assume coalescing which means zero cost
9057 for the scalar and the vector PHIs. This avoids artificially
9058 favoring the vector path (but may pessimize it in some cases). */
9059 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9060 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9061 vector_stmt, stmt_info, vectype, 0, vect_body);
9062 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9063 return true;
9066 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9067 basic_block bb = gimple_bb (stmt_info->stmt);
9068 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9069 auto_vec<gphi *> new_phis;
9070 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9072 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9074 /* Skip not yet vectorized defs. */
9075 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9076 && SLP_TREE_VEC_DEFS (child).is_empty ())
9077 continue;
9079 auto_vec<tree> vec_oprnds;
9080 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9081 if (!new_phis.exists ())
9083 new_phis.create (vec_oprnds.length ());
9084 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9086 /* Create the vectorized LC PHI node. */
9087 new_phis.quick_push (create_phi_node (vec_dest, bb));
9088 slp_node->push_vec_def (new_phis[j]);
9091 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9092 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9093 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9095 /* We should have at least one already vectorized child. */
9096 gcc_assert (new_phis.exists ());
9098 return true;
9101 /* Vectorizes first order recurrences. An overview of the transformation
9102 is described below. Suppose we have the following loop.
9104 int t = 0;
9105 for (int i = 0; i < n; ++i)
9107 b[i] = a[i] - t;
9108 t = a[i];
9111 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9112 looks (simplified) like:
9114 scalar.preheader:
9115 init = 0;
9117 scalar.body:
9118 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9119 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9120 _1 = a[i]
9121 b[i] = _1 - _2
9122 if (i < n) goto scalar.body
9124 In this example, _2 is a recurrence because its value depends on the
9125 previous iteration. We vectorize this as (VF = 4)
9127 vector.preheader:
9128 vect_init = vect_cst(..., ..., ..., 0)
9130 vector.body
9131 i = PHI <0(vector.preheader), i+4(vector.body)>
9132 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9133 vect_2 = a[i, i+1, i+2, i+3];
9134 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9135 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9136 if (..) goto vector.body
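Here the permute indices { 3, 4, 5, 6 } select the last lane of vect_1
followed by the first three lanes of vect_2, so vect_3 holds
{ t, a[i], a[i+1], a[i+2] }, i.e. exactly the values the scalar 't'
takes in the four corresponding scalar iterations.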
9138 In this function, vectorizable_recurr, we code generate both the
9139 vector PHI node and the permute since those together compute the
9140 vectorized value of the scalar PHI. We do not yet have the
9141 backedge value to fill in there nor into the vec_perm. Those
9142 are filled in maybe_set_vectorized_backedge_value and
9143 vect_schedule_scc.
9145 TODO: Since the scalar loop does not have a use of the recurrence
9146 outside of the loop, the natural way to implement peeling via
9147 vectorizing the live value doesn't work. For now peeling of loops
9148 with a recurrence is not implemented. For SLP the supported cases
9149 are restricted to those requiring a single vector recurrence PHI. */
9151 bool
9152 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9153 gimple **vec_stmt, slp_tree slp_node,
9154 stmt_vector_for_cost *cost_vec)
9156 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9157 return false;
9159 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9161 /* So far we only support first-order recurrence auto-vectorization. */
9162 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9163 return false;
9165 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9166 unsigned ncopies;
9167 if (slp_node)
9168 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9169 else
9170 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9171 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9172 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9173 /* We need to be able to make progress with a single vector. */
9174 if (maybe_gt (dist * 2, nunits))
9176 if (dump_enabled_p ())
9177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9178 "first order recurrence exceeds half of "
9179 "a vector\n");
9180 return false;
9183 /* First-order recurrence autovectorization needs to handle permutation
9184 with indices = [nunits-1, nunits, nunits+1, ...]. */
9185 vec_perm_builder sel (nunits, 1, 3);
9186 for (int i = 0; i < 3; ++i)
9187 sel.quick_push (nunits - dist + i);
9188 vec_perm_indices indices (sel, 2, nunits);
9190 if (!vec_stmt) /* transformation not required. */
9192 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9193 indices))
9194 return false;
9196 if (slp_node)
9198 /* We eventually need to set a vector type on invariant
9199 arguments. */
9200 unsigned j;
9201 slp_tree child;
9202 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9203 if (!vect_maybe_update_slp_op_vectype
9204 (child, SLP_TREE_VECTYPE (slp_node)))
9206 if (dump_enabled_p ())
9207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9208 "incompatible vector types for "
9209 "invariants\n");
9210 return false;
9213 /* The recurrence costs the initialization vector and one permute
9214 for each copy. */
9215 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9216 stmt_info, 0, vect_prologue);
9217 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9218 stmt_info, 0, vect_body);
9219 if (dump_enabled_p ())
9220 dump_printf_loc (MSG_NOTE, vect_location,
9221 "vectorizable_recurr: inside_cost = %d, "
9222 "prologue_cost = %d .\n", inside_cost,
9223 prologue_cost);
9225 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9226 return true;
9229 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9230 basic_block bb = gimple_bb (phi);
9231 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9232 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9234 gimple_seq stmts = NULL;
9235 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9236 gsi_insert_seq_on_edge_immediate (pe, stmts);
9238 tree vec_init = build_vector_from_val (vectype, preheader);
9239 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9241 /* Create the vectorized first-order PHI node. */
9242 tree vec_dest = vect_get_new_vect_var (vectype,
9243 vect_simple_var, "vec_recur_");
9244 gphi *new_phi = create_phi_node (vec_dest, bb);
9245 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9247 /* Insert the shuffles for the first-order recurrence autovectorization:
9248 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9249 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9251 /* Insert the required permute after the latch definition. The
9252 second and later operands are tentative and will be updated when we have
9253 vectorized the latch definition. */
9254 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9255 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9256 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9257 gsi_next (&gsi2);
9259 for (unsigned i = 0; i < ncopies; ++i)
9261 vec_dest = make_ssa_name (vectype);
9262 gassign *vperm
9263 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9264 i == 0 ? gimple_phi_result (new_phi) : NULL,
9265 NULL, perm);
9266 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9268 if (slp_node)
9269 slp_node->push_vec_def (vperm);
9270 else
9271 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9274 if (!slp_node)
9275 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9276 return true;
9279 /* Return true if VECTYPE represents a vector that requires lowering
9280 by the vector lowering pass. */
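/* For instance, a vector type whose TYPE_MODE is not a vector mode
   (because the target has no suitable vector mode and an integer mode
   is used instead) counts as "emulated" and must be lowered, while
   single-bit boolean (mask) vectors are excluded.  */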
9282 bool
9283 vect_emulated_vector_p (tree vectype)
9285 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9286 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9287 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9290 /* Return true if we can emulate CODE on an integer mode representation
9291 of a vector. */
9293 bool
9294 vect_can_vectorize_without_simd_p (tree_code code)
9296 switch (code)
9298 case PLUS_EXPR:
9299 case MINUS_EXPR:
9300 case NEGATE_EXPR:
9301 case BIT_AND_EXPR:
9302 case BIT_IOR_EXPR:
9303 case BIT_XOR_EXPR:
9304 case BIT_NOT_EXPR:
9305 return true;
9307 default:
9308 return false;
9312 /* Likewise, but taking a code_helper. */
9314 bool
9315 vect_can_vectorize_without_simd_p (code_helper code)
9317 return (code.is_tree_code ()
9318 && vect_can_vectorize_without_simd_p (tree_code (code)));
9321 /* Create vector init for vectorized iv. */
9322 static tree
9323 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9324 tree step_expr, poly_uint64 nunits,
9325 tree vectype,
9326 enum vect_induction_op_type induction_type)
9328 unsigned HOST_WIDE_INT const_nunits;
9329 tree vec_shift, vec_init, new_name;
9330 unsigned i;
9331 tree itype = TREE_TYPE (vectype);
9333 /* iv_loop is the loop to be vectorized. Create the first VF values of the
9334 IV, e.g. [X, X>>S, X>>2*S, X>>3*S] for shr (S = step_expr, X = init_expr). */
9335 new_name = gimple_convert (stmts, itype, init_expr);
9336 switch (induction_type)
9338 case vect_step_op_shr:
9339 case vect_step_op_shl:
9340 /* Build the initial value from per-lane shifts of init_expr. */
9341 vec_init = gimple_build_vector_from_val (stmts,
9342 vectype,
9343 new_name);
9344 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9345 build_zero_cst (itype), step_expr);
9346 vec_init = gimple_build (stmts,
9347 (induction_type == vect_step_op_shr
9348 ? RSHIFT_EXPR : LSHIFT_EXPR),
9349 vectype, vec_init, vec_shift);
9350 break;
9352 case vect_step_op_neg:
9354 vec_init = gimple_build_vector_from_val (stmts,
9355 vectype,
9356 new_name);
9357 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9358 vectype, vec_init);
9359 /* The encoding has 2 interleaved stepped patterns. */
9360 vec_perm_builder sel (nunits, 2, 3);
9361 sel.quick_grow (6);
9362 for (i = 0; i < 3; i++)
9364 sel[2 * i] = i;
9365 sel[2 * i + 1] = i + nunits;
9367 vec_perm_indices indices (sel, 2, nunits);
9368 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9369 fail when vec_init is a const vector. In that situation vec_perm is not
9370 really needed. */
9371 tree perm_mask_even
9372 = vect_gen_perm_mask_any (vectype, indices);
9373 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9374 vectype,
9375 vec_init, vec_neg,
9376 perm_mask_even);
9378 break;
9380 case vect_step_op_mul:
9382 /* Use an unsigned mult to avoid undefined behavior on integer overflow. */
9383 gcc_assert (nunits.is_constant (&const_nunits));
9384 tree utype = unsigned_type_for (itype);
9385 tree uvectype = build_vector_type (utype,
9386 TYPE_VECTOR_SUBPARTS (vectype));
9387 new_name = gimple_convert (stmts, utype, new_name);
9388 vec_init = gimple_build_vector_from_val (stmts,
9389 uvectype,
9390 new_name);
9391 tree_vector_builder elts (uvectype, const_nunits, 1);
9392 tree elt_step = build_one_cst (utype);
9394 elts.quick_push (elt_step);
9395 for (i = 1; i < const_nunits; i++)
9397 /* Create: new_name_i = pow (step_expr, i). */
9398 elt_step = gimple_build (stmts, MULT_EXPR,
9399 utype, elt_step, step_expr);
9400 elts.quick_push (elt_step);
9402 /* Create a vector from [new_name_0, new_name_1, ...,
9403 new_name_nunits-1]. */
9404 tree vec_mul = gimple_build_vector (stmts, &elts);
9405 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9406 vec_init, vec_mul);
9407 vec_init = gimple_convert (stmts, vectype, vec_init);
9409 break;
9411 default:
9412 gcc_unreachable ();
9415 return vec_init;
9418 /* Peel init_expr by skip_niter for induction_type. */
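/* For example (illustrative numbers only): peeling SKIP_NITERS = 4
   iterations of a multiplicative IV x *= 3 with initial value 5 yields
   the new initial value 5 * 3^4 = 405, computed modulo 2^precision
   below via mpz_powm; for shift IVs the accumulated shift amount is
   step * SKIP_NITERS, and for a negated IV only the parity of
   SKIP_NITERS matters.  */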
9419 tree
9420 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9421 tree skip_niters, tree step_expr,
9422 enum vect_induction_op_type induction_type)
9424 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9425 tree type = TREE_TYPE (init_expr);
9426 unsigned prec = TYPE_PRECISION (type);
9427 switch (induction_type)
9429 case vect_step_op_neg:
9430 if (TREE_INT_CST_LOW (skip_niters) % 2)
9431 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9432 /* else no change. */
9433 break;
9435 case vect_step_op_shr:
9436 case vect_step_op_shl:
9437 skip_niters = gimple_convert (stmts, type, skip_niters);
9438 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9439 /* When the shift amount >= precision, we need to avoid undefined behavior.
9440 In the original loop there is no such undefined behavior, and semantically
9441 init_expr becomes 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9442 if (!tree_fits_uhwi_p (step_expr)
9443 || tree_to_uhwi (step_expr) >= prec)
9445 if (induction_type == vect_step_op_shl
9446 || TYPE_UNSIGNED (type))
9447 init_expr = build_zero_cst (type);
9448 else
9449 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9450 init_expr,
9451 wide_int_to_tree (type, prec - 1));
9453 else
9454 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9455 ? RSHIFT_EXPR : LSHIFT_EXPR),
9456 type, init_expr, step_expr);
9457 break;
9459 case vect_step_op_mul:
9461 tree utype = unsigned_type_for (type);
9462 init_expr = gimple_convert (stmts, utype, init_expr);
9463 wide_int skipn = wi::to_wide (skip_niters);
9464 wide_int begin = wi::to_wide (step_expr);
9465 auto_mpz base, exp, mod, res;
9466 wi::to_mpz (begin, base, TYPE_SIGN (type));
9467 wi::to_mpz (skipn, exp, UNSIGNED);
9468 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9469 mpz_powm (res, base, exp, mod);
9470 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9471 tree mult_expr = wide_int_to_tree (utype, begin);
9472 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9473 init_expr, mult_expr);
9474 init_expr = gimple_convert (stmts, type, init_expr);
9476 break;
9478 default:
9479 gcc_unreachable ();
9482 return init_expr;
9485 /* Create vector step for vectorized iv. */
9486 static tree
9487 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9488 poly_uint64 vf,
9489 enum vect_induction_op_type induction_type)
9491 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9492 tree new_name = NULL;
9493 /* Step should be pow (step, vf) for mult induction. */
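/* E.g. (illustrative): for x *= 3 and VF = 4 each vector iteration
   multiplies every lane by 3^4 = 81; for shift IVs the per-iteration
   step is step * VF instead.  */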
9494 if (induction_type == vect_step_op_mul)
9496 gcc_assert (vf.is_constant ());
9497 wide_int begin = wi::to_wide (step_expr);
9499 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9500 begin = wi::mul (begin, wi::to_wide (step_expr));
9502 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9504 else if (induction_type == vect_step_op_neg)
9505 /* Do nothing. */
9507 else
9508 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9509 expr, step_expr);
9510 return new_name;
9513 static tree
9514 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9515 stmt_vec_info stmt_info,
9516 tree new_name, tree vectype,
9517 enum vect_induction_op_type induction_type)
9519 /* No step is needed for neg induction. */
9520 if (induction_type == vect_step_op_neg)
9521 return NULL;
9523 tree t = unshare_expr (new_name);
9524 gcc_assert (CONSTANT_CLASS_P (new_name)
9525 || TREE_CODE (new_name) == SSA_NAME);
9526 tree new_vec = build_vector_from_val (vectype, t);
9527 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9528 new_vec, vectype, NULL);
9529 return vec_step;
9532 /* Update vectorized iv with vect_step, induc_def is init. */
9533 static tree
9534 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9535 tree induc_def, tree vec_step,
9536 enum vect_induction_op_type induction_type)
9538 tree vec_def = induc_def;
9539 switch (induction_type)
9541 case vect_step_op_mul:
9543 /* Use an unsigned mult to avoid undefined behavior on integer overflow. */
9544 tree uvectype
9545 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9546 TYPE_VECTOR_SUBPARTS (vectype));
9547 vec_def = gimple_convert (stmts, uvectype, vec_def);
9548 vec_step = gimple_convert (stmts, uvectype, vec_step);
9549 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9550 vec_def, vec_step);
9551 vec_def = gimple_convert (stmts, vectype, vec_def);
9553 break;
9555 case vect_step_op_shr:
9556 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9557 vec_def, vec_step);
9558 break;
9560 case vect_step_op_shl:
9561 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9562 vec_def, vec_step);
9563 break;
9564 case vect_step_op_neg:
9565 vec_def = induc_def;
9566 /* Do nothing. */
9567 break;
9568 default:
9569 gcc_unreachable ();
9572 return vec_def;
9576 /* Function vectorizable_induction
9578 Check if STMT_INFO performs a nonlinear induction computation that can be
9579 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9580 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9581 basic block.
9582 Return true if STMT_INFO is vectorizable in this way. */
9584 static bool
9585 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9586 stmt_vec_info stmt_info,
9587 gimple **vec_stmt, slp_tree slp_node,
9588 stmt_vector_for_cost *cost_vec)
9590 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9591 unsigned ncopies;
9592 bool nested_in_vect_loop = false;
9593 class loop *iv_loop;
9594 tree vec_def;
9595 edge pe = loop_preheader_edge (loop);
9596 basic_block new_bb;
9597 tree vec_init, vec_step;
9598 tree new_name;
9599 gimple *new_stmt;
9600 gphi *induction_phi;
9601 tree induc_def, vec_dest;
9602 tree init_expr, step_expr;
9603 tree niters_skip;
9604 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9605 unsigned i;
9606 gimple_stmt_iterator si;
9608 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9611 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9612 enum vect_induction_op_type induction_type
9613 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9615 gcc_assert (induction_type > vect_step_op_add);
9617 if (slp_node)
9618 ncopies = 1;
9619 else
9620 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9621 gcc_assert (ncopies >= 1);
9623 /* FORNOW. Only handle nonlinear induction in the same loop. */
9624 if (nested_in_vect_loop_p (loop, stmt_info))
9626 if (dump_enabled_p ())
9627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9628 "nonlinear induction in nested loop.\n");
9629 return false;
9632 iv_loop = loop;
9633 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9635 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9636 update for each iv and a permutation to generate the wanted vector iv. */
9637 if (slp_node)
9639 if (dump_enabled_p ())
9640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9641 "SLP induction not supported for nonlinear"
9642 " induction.\n");
9643 return false;
9646 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9648 if (dump_enabled_p ())
9649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9650 "floating point nonlinear induction vectorization"
9651 " not supported.\n");
9652 return false;
9655 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9656 init_expr = vect_phi_initial_value (phi);
9657 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9658 && TREE_CODE (step_expr) == INTEGER_CST);
9659 /* step_expr should be aligned with init_expr,
9660 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9661 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9663 if (TREE_CODE (init_expr) == INTEGER_CST)
9664 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9665 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9667 /* INIT_EXPR could be a bit_field, bail out for such case. */
9668 if (dump_enabled_p ())
9669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670 "nonlinear induction vectorization failed:"
9671 " component type of vectype is not a nop conversion"
9672 " from type of init_expr.\n");
9673 return false;
9676 switch (induction_type)
9678 case vect_step_op_neg:
9679 if (TREE_CODE (init_expr) != INTEGER_CST
9680 && TREE_CODE (init_expr) != REAL_CST)
9682 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9683 if (!directly_supported_p (NEGATE_EXPR, vectype))
9684 return false;
9686 /* The encoding has 2 interleaved stepped patterns. */
9687 vec_perm_builder sel (nunits, 2, 3);
9688 machine_mode mode = TYPE_MODE (vectype);
9689 sel.quick_grow (6);
9690 for (i = 0; i < 3; i++)
9692 sel[i * 2] = i;
9693 sel[i * 2 + 1] = i + nunits;
9695 vec_perm_indices indices (sel, 2, nunits);
9696 if (!can_vec_perm_const_p (mode, mode, indices))
9697 return false;
9699 break;
9701 case vect_step_op_mul:
9703 /* Check for backend support of MULT_EXPR. */
9704 if (!directly_supported_p (MULT_EXPR, vectype))
9705 return false;
9707 /* ??? How to construct the vector step for variable-length vectors:
9708 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9709 if (!vf.is_constant ())
9710 return false;
9712 break;
9714 case vect_step_op_shr:
9715 /* Check for backend support of RSHIFT_EXPR. */
9716 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9717 return false;
9719 /* Don't shift more than the type precision, to avoid undefined behavior. */
9720 if (!tree_fits_uhwi_p (step_expr)
9721 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9722 TYPE_PRECISION (TREE_TYPE (init_expr))))
9723 return false;
9724 break;
9726 case vect_step_op_shl:
9727 /* Check for backend support of LSHIFT_EXPR. */
9728 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9729 return false;
9731 /* Don't shift more than the type precision, to avoid undefined behavior. */
9732 if (!tree_fits_uhwi_p (step_expr)
9733 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9734 TYPE_PRECISION (TREE_TYPE (init_expr))))
9735 return false;
9737 break;
9739 default:
9740 gcc_unreachable ();
9743 if (!vec_stmt) /* transformation not required. */
9745 unsigned inside_cost = 0, prologue_cost = 0;
9746 /* loop cost for vec_loop. */
9748 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9749 stmt_info, 0, vect_body);
9751 /* Neg induction doesn't have any
9752 inside_cost. */
9753 if (induction_type == vect_step_op_neg)
9754 inside_cost = 0;
9756 /* prologue cost for vec_init and vec_step. */
9757 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9758 stmt_info, 0, vect_prologue);
9760 if (dump_enabled_p ())
9761 dump_printf_loc (MSG_NOTE, vect_location,
9762 "vect_model_induction_cost: inside_cost = %d, "
9763 "prologue_cost = %d. \n", inside_cost,
9764 prologue_cost);
9766 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9767 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9768 return true;
9771 /* Transform. */
9773 /* Compute a vector variable, initialized with the first VF values of
9774 the induction variable. E.g., for an iv with IV_PHI='X' and
9775 evolution S, for a vector of 4 units, we want to compute the
9776 nonlinear analogue of [X, X + S, X + 2*S, X + 3*S]. */
9778 if (dump_enabled_p ())
9779 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9781 pe = loop_preheader_edge (iv_loop);
9782 /* Find the first insertion point in the BB. */
9783 basic_block bb = gimple_bb (phi);
9784 si = gsi_after_labels (bb);
9786 gimple_seq stmts = NULL;
9788 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9789 /* If we are using the loop mask to "peel" for alignment then we need
9790 to adjust the start value here. */
9791 if (niters_skip != NULL_TREE)
9792 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9793 step_expr, induction_type);
9795 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9796 step_expr, nunits, vectype,
9797 induction_type);
9798 if (stmts)
9800 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9801 gcc_assert (!new_bb);
9804 stmts = NULL;
9805 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9806 vf, induction_type);
9807 if (stmts)
9809 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9810 gcc_assert (!new_bb);
9813 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9814 new_name, vectype,
9815 induction_type);
9816 /* Create the following def-use cycle:
9817 loop prolog:
9818 vec_init = ...
9819 vec_step = ...
9820 loop:
9821 vec_iv = PHI <vec_init, vec_loop>
9823 STMT
9825 vec_loop = vec_iv OP vec_step; where OP is the nonlinear update (*, >> or <<). */
9827 /* Create the induction-phi that defines the induction-operand. */
9828 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9829 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9830 induc_def = PHI_RESULT (induction_phi);
9832 /* Create the iv update inside the loop. */
9833 stmts = NULL;
9834 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9835 induc_def, vec_step,
9836 induction_type);
9838 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9839 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9841 /* Set the arguments of the phi node: */
9842 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9843 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9844 UNKNOWN_LOCATION);
9846 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9847 *vec_stmt = induction_phi;
9849 /* In case the vectorization factor (VF) is bigger than the number
9850 of elements that we can fit in a vectype (nunits), we have to generate
9851 more than one vector stmt, i.e. we need to "unroll" the
9852 vector stmt by a factor VF/nunits. For more details see documentation
9853 in vectorizable_operation. */
9855 if (ncopies > 1)
9857 stmts = NULL;
9858 /* FORNOW. This restriction should be relaxed. */
9859 gcc_assert (!nested_in_vect_loop);
9861 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9862 nunits, induction_type);
9864 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9865 new_name, vectype,
9866 induction_type);
9867 vec_def = induc_def;
9868 for (i = 1; i < ncopies; i++)
9870 /* vec_i = vec_prev + vec_step. */
9871 stmts = NULL;
9872 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9873 vec_def, vec_step,
9874 induction_type);
9875 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9876 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9877 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9881 if (dump_enabled_p ())
9882 dump_printf_loc (MSG_NOTE, vect_location,
9883 "transform induction: created def-use cycle: %G%G",
9884 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9886 return true;
9889 /* Function vectorizable_induction
9891 Check if STMT_INFO performs an induction computation that can be vectorized.
9892 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9893 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9894 Return true if STMT_INFO is vectorizable in this way. */
9896 bool
9897 vectorizable_induction (loop_vec_info loop_vinfo,
9898 stmt_vec_info stmt_info,
9899 gimple **vec_stmt, slp_tree slp_node,
9900 stmt_vector_for_cost *cost_vec)
9902 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9903 unsigned ncopies;
9904 bool nested_in_vect_loop = false;
9905 class loop *iv_loop;
9906 tree vec_def;
9907 edge pe = loop_preheader_edge (loop);
9908 basic_block new_bb;
9909 tree new_vec, vec_init, vec_step, t;
9910 tree new_name;
9911 gimple *new_stmt;
9912 gphi *induction_phi;
9913 tree induc_def, vec_dest;
9914 tree init_expr, step_expr;
9915 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9916 unsigned i;
9917 tree expr;
9918 gimple_stmt_iterator si;
9919 enum vect_induction_op_type induction_type
9920 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9922 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9923 if (!phi)
9924 return false;
9926 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9927 return false;
9929 /* Make sure it was recognized as induction computation. */
9930 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9931 return false;
9933 /* Handle nonlinear induction in a separate place. */
9934 if (induction_type != vect_step_op_add)
9935 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9936 vec_stmt, slp_node, cost_vec);
9938 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9939 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9941 if (slp_node)
9942 ncopies = 1;
9943 else
9944 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9945 gcc_assert (ncopies >= 1);
9947 /* FORNOW. These restrictions should be relaxed. */
9948 if (nested_in_vect_loop_p (loop, stmt_info))
9950 imm_use_iterator imm_iter;
9951 use_operand_p use_p;
9952 gimple *exit_phi;
9953 edge latch_e;
9954 tree loop_arg;
9956 if (ncopies > 1)
9958 if (dump_enabled_p ())
9959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9960 "multiple types in nested loop.\n");
9961 return false;
9964 exit_phi = NULL;
9965 latch_e = loop_latch_edge (loop->inner);
9966 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9967 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9969 gimple *use_stmt = USE_STMT (use_p);
9970 if (is_gimple_debug (use_stmt))
9971 continue;
9973 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9975 exit_phi = use_stmt;
9976 break;
9979 if (exit_phi)
9981 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9982 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9983 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9985 if (dump_enabled_p ())
9986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9987 "inner-loop induction only used outside "
9988 "of the outer vectorized loop.\n");
9989 return false;
9993 nested_in_vect_loop = true;
9994 iv_loop = loop->inner;
9996 else
9997 iv_loop = loop;
9998 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10000 if (slp_node && !nunits.is_constant ())
10002 /* The current SLP code creates the step value element-by-element. */
10003 if (dump_enabled_p ())
10004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10005 "SLP induction not supported for variable-length"
10006 " vectors.\n");
10007 return false;
10010 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10012 if (dump_enabled_p ())
10013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10014 "floating point induction vectorization disabled\n");
10015 return false;
10018 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10019 gcc_assert (step_expr != NULL_TREE);
10020 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10022 /* Check for backend support of PLUS/MINUS_EXPR. */
10023 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10024 || !directly_supported_p (MINUS_EXPR, step_vectype))
10025 return false;
10027 if (!vec_stmt) /* transformation not required. */
10029 unsigned inside_cost = 0, prologue_cost = 0;
10030 if (slp_node)
10032 /* We eventually need to set a vector type on invariant
10033 arguments. */
10034 unsigned j;
10035 slp_tree child;
10036 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10037 if (!vect_maybe_update_slp_op_vectype
10038 (child, SLP_TREE_VECTYPE (slp_node)))
10040 if (dump_enabled_p ())
10041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10042 "incompatible vector types for "
10043 "invariants\n");
10044 return false;
10046 /* loop cost for vec_loop. */
10047 inside_cost
10048 = record_stmt_cost (cost_vec,
10049 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10050 vector_stmt, stmt_info, 0, vect_body);
10051 /* prologue cost for vec_init (if not nested) and step. */
10052 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10053 scalar_to_vec,
10054 stmt_info, 0, vect_prologue);
10056 else /* if (!slp_node) */
10058 /* loop cost for vec_loop. */
10059 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10060 stmt_info, 0, vect_body);
10061 /* prologue cost for vec_init and vec_step. */
10062 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10063 stmt_info, 0, vect_prologue);
10065 if (dump_enabled_p ())
10066 dump_printf_loc (MSG_NOTE, vect_location,
10067 "vect_model_induction_cost: inside_cost = %d, "
10068 "prologue_cost = %d .\n", inside_cost,
10069 prologue_cost);
10071 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10072 DUMP_VECT_SCOPE ("vectorizable_induction");
10073 return true;
10076 /* Transform. */
10078 /* Compute a vector variable, initialized with the first VF values of
10079 the induction variable. E.g., for an iv with IV_PHI='X' and
10080 evolution S, for a vector of 4 units, we want to compute:
10081 [X, X + S, X + 2*S, X + 3*S]. */
10083 if (dump_enabled_p ())
10084 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10086 pe = loop_preheader_edge (iv_loop);
10087 /* Find the first insertion point in the BB. */
10088 basic_block bb = gimple_bb (phi);
10089 si = gsi_after_labels (bb);
10091 /* For SLP induction we have to generate several IVs; for example,
10092 with group size 3 we need
10093 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10094 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10095 if (slp_node)
10097 /* Enforced above. */
10098 unsigned int const_nunits = nunits.to_constant ();
10100 /* The initial values are vectorized, but any lanes > group_size
10101 need adjustment. */
10102 slp_tree init_node
10103 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10105 /* Gather steps. Since we do not vectorize inductions as
10106 cycles we have to reconstruct the step from SCEV data. */
10107 unsigned group_size = SLP_TREE_LANES (slp_node);
10108 tree *steps = XALLOCAVEC (tree, group_size);
10109 tree *inits = XALLOCAVEC (tree, group_size);
10110 stmt_vec_info phi_info;
10111 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10113 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10114 if (!init_node)
10115 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10116 pe->dest_idx);
10119 /* Now generate the IVs. */
10120 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10121 gcc_assert ((const_nunits * nvects) % group_size == 0);
10122 unsigned nivs;
10123 if (nested_in_vect_loop)
10124 nivs = nvects;
10125 else
10127 /* Compute the number of distinct IVs we need. First reduce
10128 group_size if it is a multiple of const_nunits so we get
10129 one IV for a group_size of 4 but const_nunits 2. */
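/* For example, group_size 8 with const_nunits 4 reduces to
   group_sizep 2 and thus a single IV, while group_size 6 with
   const_nunits 4 is not reduced and needs
   lcm (6, 4) / 4 == 3 IVs.  */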
10130 unsigned group_sizep = group_size;
10131 if (group_sizep % const_nunits == 0)
10132 group_sizep = group_sizep / const_nunits;
10133 nivs = least_common_multiple (group_sizep,
10134 const_nunits) / const_nunits;
10136 tree stept = TREE_TYPE (step_vectype);
10137 tree lupdate_mul = NULL_TREE;
10138 if (!nested_in_vect_loop)
10140 /* The number of iterations covered in one vector iteration. */
10141 unsigned lup_mul = (nvects * const_nunits) / group_size;
10142 lupdate_mul
10143 = build_vector_from_val (step_vectype,
10144 SCALAR_FLOAT_TYPE_P (stept)
10145 ? build_real_from_wide (stept, lup_mul,
10146 UNSIGNED)
10147 : build_int_cstu (stept, lup_mul));
10149 tree peel_mul = NULL_TREE;
10150 gimple_seq init_stmts = NULL;
10151 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10153 if (SCALAR_FLOAT_TYPE_P (stept))
10154 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10155 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10156 else
10157 peel_mul = gimple_convert (&init_stmts, stept,
10158 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10159 peel_mul = gimple_build_vector_from_val (&init_stmts,
10160 step_vectype, peel_mul);
10162 unsigned ivn;
10163 auto_vec<tree> vec_steps;
10164 for (ivn = 0; ivn < nivs; ++ivn)
10166 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10167 tree_vector_builder init_elts (vectype, const_nunits, 1);
10168 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10169 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10171 /* The scalar steps of the IVs. */
10172 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10173 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10174 step_elts.quick_push (elt);
10175 if (!init_node)
10177 /* The scalar inits of the IVs if not vectorized. */
10178 elt = inits[(ivn*const_nunits + eltn) % group_size];
10179 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10180 TREE_TYPE (elt)))
10181 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10182 TREE_TYPE (vectype), elt);
10183 init_elts.quick_push (elt);
10185 /* The number of steps to add to the initial values. */
10186 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10187 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10188 ? build_real_from_wide (stept,
10189 mul_elt, UNSIGNED)
10190 : build_int_cstu (stept, mul_elt));
10192 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10193 vec_steps.safe_push (vec_step);
10194 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10195 if (peel_mul)
10196 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10197 step_mul, peel_mul);
10198 if (!init_node)
10199 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10201 /* Create the induction-phi that defines the induction-operand. */
10202 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10203 "vec_iv_");
10204 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10205 induc_def = PHI_RESULT (induction_phi);
10207 /* Create the iv update inside the loop */
10208 tree up = vec_step;
10209 if (lupdate_mul)
10210 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10211 vec_step, lupdate_mul);
10212 gimple_seq stmts = NULL;
10213 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10214 vec_def = gimple_build (&stmts,
10215 PLUS_EXPR, step_vectype, vec_def, up);
10216 vec_def = gimple_convert (&stmts, vectype, vec_def);
10217 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10218 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10219 UNKNOWN_LOCATION);
10221 if (init_node)
10222 vec_init = vect_get_slp_vect_def (init_node, ivn);
10223 if (!nested_in_vect_loop
10224 && !integer_zerop (step_mul))
10226 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10227 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10228 vec_step, step_mul);
10229 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10230 vec_def, up);
10231 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10234 /* Set the arguments of the phi node: */
10235 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10237 slp_node->push_vec_def (induction_phi);
10239 if (!nested_in_vect_loop)
10241 /* Fill up to the number of vectors we need for the whole group. */
10242 nivs = least_common_multiple (group_size,
10243 const_nunits) / const_nunits;
10244 vec_steps.reserve (nivs-ivn);
10245 for (; ivn < nivs; ++ivn)
10247 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10248 vec_steps.quick_push (vec_steps[0]);
10252 /* Re-use IVs when we can. We are generating further vector
10253 stmts by adding VF' * stride to the IVs generated above. */
10254 if (ivn < nvects)
10256 unsigned vfp
10257 = least_common_multiple (group_size, const_nunits) / group_size;
10258 tree lupdate_mul
10259 = build_vector_from_val (step_vectype,
10260 SCALAR_FLOAT_TYPE_P (stept)
10261 ? build_real_from_wide (stept,
10262 vfp, UNSIGNED)
10263 : build_int_cstu (stept, vfp));
10264 for (; ivn < nvects; ++ivn)
10266 gimple *iv
10267 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10268 tree def = gimple_get_lhs (iv);
10269 if (ivn < 2*nivs)
10270 vec_steps[ivn - nivs]
10271 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10272 vec_steps[ivn - nivs], lupdate_mul);
10273 gimple_seq stmts = NULL;
10274 def = gimple_convert (&stmts, step_vectype, def);
10275 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10276 def, vec_steps[ivn % nivs]);
10277 def = gimple_convert (&stmts, vectype, def);
10278 if (gimple_code (iv) == GIMPLE_PHI)
10279 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10280 else
10282 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10283 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10285 slp_node->push_vec_def (def);
10289 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10290 gcc_assert (!new_bb);
10292 return true;
10295 init_expr = vect_phi_initial_value (phi);
10297 gimple_seq stmts = NULL;
10298 if (!nested_in_vect_loop)
10300 /* Convert the initial value to the IV update type. */
10301 tree new_type = TREE_TYPE (step_expr);
10302 init_expr = gimple_convert (&stmts, new_type, init_expr);
10304 /* If we are using the loop mask to "peel" for alignment then we need
10305 to adjust the start value here. */
10306 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10307 if (skip_niters != NULL_TREE)
10309 if (FLOAT_TYPE_P (vectype))
10310 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10311 skip_niters);
10312 else
10313 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10314 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10315 skip_niters, step_expr);
10316 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10317 init_expr, skip_step);
10321 if (stmts)
10323 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10324 gcc_assert (!new_bb);
10327 /* Create the vector that holds the initial_value of the induction. */
10328 if (nested_in_vect_loop)
10330 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10331 been created during vectorization of previous stmts. We obtain it
10332 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10333 auto_vec<tree> vec_inits;
10334 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10335 init_expr, &vec_inits);
10336 vec_init = vec_inits[0];
10337 /* If the initial value is not of proper type, convert it. */
10338 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10340 new_stmt
10341 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10342 vect_simple_var,
10343 "vec_iv_"),
10344 VIEW_CONVERT_EXPR,
10345 build1 (VIEW_CONVERT_EXPR, vectype,
10346 vec_init));
10347 vec_init = gimple_assign_lhs (new_stmt);
10348 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10349 new_stmt);
10350 gcc_assert (!new_bb);
10353 else
10355 /* iv_loop is the loop to be vectorized. Create:
10356 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10357 stmts = NULL;
10358 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10360 unsigned HOST_WIDE_INT const_nunits;
10361 if (nunits.is_constant (&const_nunits))
10363 tree_vector_builder elts (step_vectype, const_nunits, 1);
10364 elts.quick_push (new_name);
10365 for (i = 1; i < const_nunits; i++)
10367 /* Create: new_name_i = new_name + step_expr */
10368 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10369 new_name, step_expr);
10370 elts.quick_push (new_name);
10372 /* Create a vector from [new_name_0, new_name_1, ...,
10373 new_name_nunits-1] */
10374 vec_init = gimple_build_vector (&stmts, &elts);
10376 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10377 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10378 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10379 new_name, step_expr);
10380 else
10382 /* Build:
10383 [base, base, base, ...]
10384 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10385 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10386 gcc_assert (flag_associative_math);
10387 tree index = build_index_vector (step_vectype, 0, 1);
10388 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10389 new_name);
10390 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10391 step_expr);
10392 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10393 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10394 vec_init, step_vec);
10395 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10396 vec_init, base_vec);
10398 vec_init = gimple_convert (&stmts, vectype, vec_init);
10400 if (stmts)
10402 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10403 gcc_assert (!new_bb);
10408 /* Create the vector that holds the step of the induction. */
10409 gimple_stmt_iterator *step_iv_si = NULL;
10410 if (nested_in_vect_loop)
10411 /* iv_loop is nested in the loop to be vectorized. Generate:
10412 vec_step = [S, S, S, S] */
10413 new_name = step_expr;
10414 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10416 /* When we're using the loop_len produced by SELECT_VL, the non-final
10417 iterations are not always processing VF elements. So vectorize the
10418 induction variable instead of
10420 _21 = vect_vec_iv_.6_22 + { VF, ... };
10422 We should generate:
10424 _35 = .SELECT_VL (ivtmp_33, VF);
10425 vect_cst__22 = [vec_duplicate_expr] _35;
10426 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10427 gcc_assert (!slp_node);
10428 gimple_seq seq = NULL;
10429 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10430 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10431 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10432 unshare_expr (len)),
10433 &seq, true, NULL_TREE);
10434 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10435 step_expr);
10436 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10437 step_iv_si = &si;
10439 else
10441 /* iv_loop is the loop to be vectorized. Generate:
10442 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10443 gimple_seq seq = NULL;
10444 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10446 expr = build_int_cst (integer_type_node, vf);
10447 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10449 else
10450 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10451 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10452 expr, step_expr);
10453 if (seq)
10455 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10456 gcc_assert (!new_bb);
10460 t = unshare_expr (new_name);
10461 gcc_assert (CONSTANT_CLASS_P (new_name)
10462 || TREE_CODE (new_name) == SSA_NAME);
10463 new_vec = build_vector_from_val (step_vectype, t);
10464 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10465 new_vec, step_vectype, step_iv_si);
10468 /* Create the following def-use cycle:
10469 loop prolog:
10470 vec_init = ...
10471 vec_step = ...
10472 loop:
10473 vec_iv = PHI <vec_init, vec_loop>
10475 STMT
10477 vec_loop = vec_iv + vec_step; */
10479 /* Create the induction-phi that defines the induction-operand. */
10480 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10481 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10482 induc_def = PHI_RESULT (induction_phi);
10484 /* Create the iv update inside the loop */
10485 stmts = NULL;
10486 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10487 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10488 vec_def = gimple_convert (&stmts, vectype, vec_def);
10489 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10490 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10492 /* Set the arguments of the phi node: */
10493 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10494 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10495 UNKNOWN_LOCATION);
10497 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10498 *vec_stmt = induction_phi;
10500 /* In case the vectorization factor (VF) is bigger than the number
10501 of elements that we can fit in a vectype (nunits), we have to generate
10502 more than one vector stmt, i.e. we need to "unroll" the
10503 vector stmt by a factor of VF/nunits. For more details see the
10504 documentation in vectorizable_operation. */
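/* For example, with VF 8 and nunits 4 we have ncopies 2: besides the
   induction PHI itself we emit vec_1 = vec_iv + 4*S as the second copy
   and vec_2 = vec_1 + 4*S, which becomes the latch value of the PHI.  */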
10506 if (ncopies > 1)
10508 gimple_seq seq = NULL;
10509 /* FORNOW. This restriction should be relaxed. */
10510 gcc_assert (!nested_in_vect_loop);
10511 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10512 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10514 /* Create the vector that holds the step of the induction. */
10515 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10517 expr = build_int_cst (integer_type_node, nunits);
10518 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10520 else
10521 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10522 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10523 expr, step_expr);
10524 if (seq)
10526 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10527 gcc_assert (!new_bb);
10530 t = unshare_expr (new_name);
10531 gcc_assert (CONSTANT_CLASS_P (new_name)
10532 || TREE_CODE (new_name) == SSA_NAME);
10533 new_vec = build_vector_from_val (step_vectype, t);
10534 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10535 new_vec, step_vectype, NULL);
10537 vec_def = induc_def;
10538 for (i = 1; i < ncopies + 1; i++)
10540 /* vec_i = vec_prev + vec_step */
10541 gimple_seq stmts = NULL;
10542 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10543 vec_def = gimple_build (&stmts,
10544 PLUS_EXPR, step_vectype, vec_def, vec_step);
10545 vec_def = gimple_convert (&stmts, vectype, vec_def);
10547 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10548 if (i < ncopies)
10550 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10551 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10553 else
10555 /* vec_1 = vec_iv + (VF/n * S)
10556 vec_2 = vec_1 + (VF/n * S)
10558 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10560 vec_n is used as vec_loop to save the large step register and
10561 related operations. */
10562 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10563 UNKNOWN_LOCATION);
10568 if (dump_enabled_p ())
10569 dump_printf_loc (MSG_NOTE, vect_location,
10570 "transform induction: created def-use cycle: %G%G",
10571 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10573 return true;
10576 /* Function vectorizable_live_operation_1.
10578 Helper function for vectorizable_live_operation. */
10580 tree
10581 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10582 stmt_vec_info stmt_info, basic_block exit_bb,
10583 tree vectype, int ncopies, slp_tree slp_node,
10584 tree bitsize, tree bitstart, tree vec_lhs,
10585 tree lhs_type, bool restart_loop,
10586 gimple_stmt_iterator *exit_gsi)
10588 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10590 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10591 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10592 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10593 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10595 gimple_seq stmts = NULL;
10596 tree new_tree;
10597 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10599 /* Emit:
10601 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10603 where VEC_LHS is the vectorized live-out result and LEN is
10604 the loop length for the final iteration. */
10605 gcc_assert (ncopies == 1 && !slp_node);
10606 gimple_seq tem = NULL;
10607 gimple_stmt_iterator gsi = gsi_last (tem);
10608 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10609 &LOOP_VINFO_LENS (loop_vinfo),
10610 1, vectype, 0, 0);
10612 /* BIAS - 1. */
10613 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10614 tree bias_minus_one
10615 = int_const_binop (MINUS_EXPR,
10616 build_int_cst (TREE_TYPE (len), biasval),
10617 build_one_cst (TREE_TYPE (len)));
10619 /* LAST_INDEX = LEN + (BIAS - 1). */
10620 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10621 len, bias_minus_one);
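/* For example, with a zero bias the last active element of the final
   iteration sits at index LEN - 1.  */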
10623 /* This would need to implement extraction of the first index, but it
10624 is not clear how that interacts with the LEN handling. At the moment
10625 we shouldn't get here since there's no LEN support for early breaks.
10626 But guard this so there's no incorrect codegen. */
10627 gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10629 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10630 tree scalar_res
10631 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10632 vec_lhs_phi, last_index);
10634 /* Convert the extracted vector element to the scalar type. */
10635 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10637 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10639 /* Emit:
10641 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10643 where VEC_LHS is the vectorized live-out result and MASK is
10644 the loop mask for the final iteration. */
10645 gcc_assert (!slp_node);
10646 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10647 gimple_seq tem = NULL;
10648 gimple_stmt_iterator gsi = gsi_last (tem);
10649 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10650 &LOOP_VINFO_MASKS (loop_vinfo),
10651 1, vectype, 0);
10652 tree scalar_res;
10654 /* For an inverted control flow with early breaks we want EXTRACT_FIRST
10655 instead of EXTRACT_LAST. Emulate by reversing the vector and mask. */
10656 if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10658 /* First create the permuted mask. */
10659 tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
10660 tree perm_dest = copy_ssa_name (mask);
10661 gimple *perm_stmt
10662 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
10663 mask, perm_mask);
10664 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10665 &gsi);
10666 mask = perm_dest;
10668 /* Then permute the vector contents. */
10669 tree perm_elem = perm_mask_for_reverse (vectype);
10670 perm_dest = copy_ssa_name (vec_lhs_phi);
10671 perm_stmt
10672 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
10673 vec_lhs_phi, perm_elem);
10674 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10675 &gsi);
10676 vec_lhs_phi = perm_dest;
10679 gimple_seq_add_seq (&stmts, tem);
10681 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10682 mask, vec_lhs_phi);
10684 /* Convert the extracted vector element to the scalar type. */
10685 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10687 else
10689 tree bftype = TREE_TYPE (vectype);
10690 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10691 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10692 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10693 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10694 &stmts, true, NULL_TREE);
10697 *exit_gsi = gsi_after_labels (exit_bb);
10698 if (stmts)
10699 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10701 return new_tree;
10704 /* Find the edge that's the final one in the path from SRC to DEST and
10705 return it; at most one forwarder block may lie between them. */
10707 static edge
10708 find_connected_edge (edge src, basic_block dest)
10710 if (src->dest == dest)
10711 return src;
10713 return find_edge (src->dest, dest);
10716 /* Function vectorizable_live_operation.
10718 STMT_INFO computes a value that is used outside the loop. Check if
10719 it can be supported. */
10721 bool
10722 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10723 slp_tree slp_node, slp_instance slp_node_instance,
10724 int slp_index, bool vec_stmt_p,
10725 stmt_vector_for_cost *cost_vec)
10727 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10728 imm_use_iterator imm_iter;
10729 tree lhs, lhs_type, bitsize;
10730 tree vectype = (slp_node
10731 ? SLP_TREE_VECTYPE (slp_node)
10732 : STMT_VINFO_VECTYPE (stmt_info));
10733 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10734 int ncopies;
10735 gimple *use_stmt;
10736 use_operand_p use_p;
10737 auto_vec<tree> vec_oprnds;
10738 int vec_entry = 0;
10739 poly_uint64 vec_index = 0;
10741 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10742 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10744 /* If a stmt of a reduction is live, vectorize it via
10745 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10746 validity so just trigger the transform here. */
10747 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10749 if (!vec_stmt_p)
10750 return true;
10751 if (slp_node)
10753 /* For reduction chains the meta-info is attached to
10754 the group leader. */
10755 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10756 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10757 /* For SLP reductions we vectorize the epilogue for
10758 all involved stmts together. */
10759 else if (slp_index != 0)
10760 return true;
10762 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10763 gcc_assert (reduc_info->is_reduc_info);
10764 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10765 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10766 return true;
10768 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10769 slp_node_instance,
10770 LOOP_VINFO_IV_EXIT (loop_vinfo));
10772 /* For an early break we only have to materialize the reduction on the merge
10773 block, but we have to find an alternate exit first. */
10774 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10776 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10777 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10779 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10780 slp_node, slp_node_instance,
10781 exit);
10782 break;
10786 return true;
10789 /* If STMT is not relevant and it is a simple assignment and its inputs are
10790 invariant then it can remain in place, unvectorized. The original last
10791 scalar value that it computes will be used. */
10792 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10794 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10795 if (dump_enabled_p ())
10796 dump_printf_loc (MSG_NOTE, vect_location,
10797 "statement is simple and uses invariant. Leaving in "
10798 "place.\n");
10799 return true;
10802 if (slp_node)
10803 ncopies = 1;
10804 else
10805 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10807 if (slp_node)
10809 gcc_assert (slp_index >= 0);
10811 /* Get the last occurrence of the scalar index from the concatenation of
10812 all the slp vectors. Calculate which slp vector it is and the index
10813 within. */
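/* For example, with 3 lanes and 2 vectors of 4 units the last
   occurrence of lane 1 is at position 2*4 - 3 + 1 == 6, i.e.
   lane 2 of vector 1.  */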
10814 int num_scalar = SLP_TREE_LANES (slp_node);
10815 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10816 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10818 /* Calculate which vector contains the result, and which lane of
10819 that vector we need. */
10820 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10822 if (dump_enabled_p ())
10823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10824 "Cannot determine which vector holds the"
10825 " final result.\n");
10826 return false;
10830 if (!vec_stmt_p)
10832 /* No transformation required. */
10833 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10835 if (slp_node)
10837 if (dump_enabled_p ())
10838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10839 "can't operate on partial vectors "
10840 "because an SLP statement is live after "
10841 "the loop.\n");
10842 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10844 else if (ncopies > 1)
10846 if (dump_enabled_p ())
10847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10848 "can't operate on partial vectors "
10849 "because ncopies is greater than 1.\n");
10850 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10852 else
10854 gcc_assert (ncopies == 1 && !slp_node);
10855 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10856 OPTIMIZE_FOR_SPEED))
10857 vect_record_loop_mask (loop_vinfo,
10858 &LOOP_VINFO_MASKS (loop_vinfo),
10859 1, vectype, NULL);
10860 else if (can_vec_extract_var_idx_p (
10861 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10862 vect_record_loop_len (loop_vinfo,
10863 &LOOP_VINFO_LENS (loop_vinfo),
10864 1, vectype, 1);
10865 else
10867 if (dump_enabled_p ())
10868 dump_printf_loc (
10869 MSG_MISSED_OPTIMIZATION, vect_location,
10870 "can't operate on partial vectors "
10871 "because the target doesn't support extract "
10872 "last reduction.\n");
10873 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10877 /* ??? Enable for loop costing as well. */
10878 if (!loop_vinfo)
10879 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10880 0, vect_epilogue);
10881 return true;
10884 /* Use the lhs of the original scalar statement. */
10885 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10886 if (dump_enabled_p ())
10887 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10888 "stmt %G", stmt);
10890 lhs = gimple_get_lhs (stmt);
10891 lhs_type = TREE_TYPE (lhs);
10893 bitsize = vector_element_bits_tree (vectype);
10895 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10896 tree vec_lhs, vec_lhs0, bitstart;
10897 gimple *vec_stmt, *vec_stmt0;
10898 if (slp_node)
10900 gcc_assert (!loop_vinfo
10901 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10902 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10904 /* Get the correct slp vectorized stmt. */
10905 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10906 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10908 /* In case we need to vectorize an early break, also get the first stmt. */
10909 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10910 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10912 /* Get entry to use. */
10913 bitstart = bitsize_int (vec_index);
10914 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10916 else
10918 /* For multiple copies, get the last copy. */
10919 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10920 vec_lhs = gimple_get_lhs (vec_stmt);
10922 /* In case we need to vectorize an early break, also get the first stmt. */
10923 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10924 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10926 /* Get the last lane in the vector. */
10927 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10930 if (loop_vinfo)
10932 /* To ensure the VEC_LHS for lane extraction stmts satisfies the
10933 loop-closed PHI requirement, insert one PHI node for it. It looks like:
10934 loop;
10936 # lhs' = PHI <lhs>
10938 loop;
10940 # vec_lhs' = PHI <vec_lhs>
10941 new_tree = lane_extract <vec_lhs', ...>;
10942 lhs' = new_tree; */
10944 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10945 /* Check if we have a loop where the chosen exit is not the main exit;
10946 in these cases, for an early break, we restart the iteration the vector
10947 code was executing. For the live values we want the value at the start
10948 of that iteration rather than at the end. */
10949 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10950 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10951 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10952 if (!is_gimple_debug (use_stmt)
10953 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10954 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10956 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10957 phi_arg_index_from_use (use_p));
10958 bool main_exit_edge = e == main_e
10959 || find_connected_edge (main_e, e->src);
10961 /* Early exits have a merge block; we want the merge block itself,
10962 so use ->src. For the main exit the merge block is the
10963 destination. */
10964 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10965 tree tmp_vec_lhs = vec_lhs;
10966 tree tmp_bitstart = bitstart;
10968 /* For an early exit where the exit is not in the BB that leads
10969 to the latch we're restarting the iteration in the
10970 scalar loop, so get the first live value. */
10971 restart_loop = restart_loop || !main_exit_edge;
10972 if (restart_loop
10973 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10975 tmp_vec_lhs = vec_lhs0;
10976 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10979 gimple_stmt_iterator exit_gsi;
10980 tree new_tree
10981 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10982 dest, vectype, ncopies,
10983 slp_node, bitsize,
10984 tmp_bitstart, tmp_vec_lhs,
10985 lhs_type, restart_loop,
10986 &exit_gsi);
10988 if (gimple_phi_num_args (use_stmt) == 1)
10990 auto gsi = gsi_for_stmt (use_stmt);
10991 remove_phi_node (&gsi, false);
10992 tree lhs_phi = gimple_phi_result (use_stmt);
10993 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10994 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10996 else
10997 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
11000 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11001 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11002 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11004 else
11006 /* For basic-block vectorization simply insert the lane-extraction. */
11007 tree bftype = TREE_TYPE (vectype);
11008 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11009 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11010 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11011 vec_lhs, bitsize, bitstart);
11012 gimple_seq stmts = NULL;
11013 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11014 &stmts, true, NULL_TREE);
11015 if (TREE_CODE (new_tree) == SSA_NAME
11016 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11017 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11018 if (is_a <gphi *> (vec_stmt))
11020 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11021 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11023 else
11025 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11026 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11029 /* Replace the use of lhs with the newly computed result. If the use stmt
11030 is a single-argument PHI, just replace all uses of the PHI result. This is
11031 necessary because the LC-SSA PHI defining lhs may be before the newly inserted stmt. */
11032 use_operand_p use_p;
11033 stmt_vec_info use_stmt_info;
11034 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11035 if (!is_gimple_debug (use_stmt)
11036 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11037 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11039 /* ??? This can happen when the live lane ends up being
11040 rooted in a vector construction code-generated by an
11041 external SLP node (and code-generation for that already
11042 happened). See gcc.dg/vect/bb-slp-47.c.
11043 Doing this is what would happen if that vector CTOR
11044 were not code-generated yet so it is not too bad.
11045 ??? In fact we'd likely want to avoid this situation
11046 in the first place. */
11047 if (TREE_CODE (new_tree) == SSA_NAME
11048 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11049 && gimple_code (use_stmt) != GIMPLE_PHI
11050 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11051 use_stmt))
11053 if (dump_enabled_p ())
11054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11055 "Using original scalar computation for "
11056 "live lane because use preceeds vector "
11057 "def\n");
11058 continue;
11060 /* ??? It can also happen that we end up pulling a def into
11061 a loop where replacing out-of-loop uses would require
11062 a new LC SSA PHI node. Retain the original scalar in
11063 those cases as well. PR98064. */
11064 if (TREE_CODE (new_tree) == SSA_NAME
11065 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11066 && (gimple_bb (use_stmt)->loop_father
11067 != gimple_bb (vec_stmt)->loop_father)
11068 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11069 gimple_bb (use_stmt)->loop_father))
11071 if (dump_enabled_p ())
11072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11073 "Using original scalar computation for "
11074 "live lane because there is an out-of-loop "
11075 "definition for it\n");
11076 continue;
11078 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11079 SET_USE (use_p, new_tree);
11080 update_stmt (use_stmt);
11084 return true;
11087 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11089 static void
11090 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11092 ssa_op_iter op_iter;
11093 imm_use_iterator imm_iter;
11094 def_operand_p def_p;
11095 gimple *ustmt;
11097 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11099 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11101 basic_block bb;
11103 if (!is_gimple_debug (ustmt))
11104 continue;
11106 bb = gimple_bb (ustmt);
11108 if (!flow_bb_inside_loop_p (loop, bb))
11110 if (gimple_debug_bind_p (ustmt))
11112 if (dump_enabled_p ())
11113 dump_printf_loc (MSG_NOTE, vect_location,
11114 "killing debug use\n");
11116 gimple_debug_bind_reset_value (ustmt);
11117 update_stmt (ustmt);
11119 else
11120 gcc_unreachable ();
11126 /* Given the loop represented by LOOP_VINFO, return true if the computation of
11127 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11128 otherwise. */
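/* For example, if the niters type is unsigned char and the latch is
   known to execute at most 254 times, then NITERSM1 + 1 cannot wrap
   around to zero; with an upper bound of 255 it could.  */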
11130 static bool
11131 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11133 /* Constant case. */
11134 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11136 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11137 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11139 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11140 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11141 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11142 return true;
11145 widest_int max;
11146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11147 /* Check the upper bound of loop niters. */
11148 if (get_max_loop_iterations (loop, &max))
11150 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11151 signop sgn = TYPE_SIGN (type);
11152 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11153 if (max < type_max)
11154 return true;
11156 return false;
11159 /* Return a mask type with half the number of elements as OLD_TYPE,
11160 given that it should have mode NEW_MODE. */
11162 tree
11163 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11165 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11166 return build_truth_vector_type_for_mode (nunits, new_mode);
11169 /* Return a mask type with twice as many elements as OLD_TYPE,
11170 given that it should have mode NEW_MODE. */
11172 tree
11173 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11175 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11176 return build_truth_vector_type_for_mode (nunits, new_mode);
11179 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11180 contain a sequence of NVECTORS masks that each control a vector of type
11181 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11182 these vector masks with the vector version of SCALAR_MASK. */
11184 void
11185 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11186 unsigned int nvectors, tree vectype, tree scalar_mask)
11188 gcc_assert (nvectors != 0);
11190 if (scalar_mask)
11192 scalar_cond_masked_key cond (scalar_mask, nvectors);
11193 loop_vinfo->scalar_cond_masked_set.add (cond);
11196 masks->mask_set.add (std::make_pair (vectype, nvectors));
11199 /* Given a complete set of masks MASKS, extract mask number INDEX
11200 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11201 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11203 See the comment above vec_loop_masks for more details about the mask
11204 arrangement. */
11206 tree
11207 vect_get_loop_mask (loop_vec_info loop_vinfo,
11208 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11209 unsigned int nvectors, tree vectype, unsigned int index)
11211 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11212 == vect_partial_vectors_while_ult)
11214 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11215 tree mask_type = rgm->type;
11217 /* Populate the rgroup's mask array, if this is the first time we've
11218 used it. */
11219 if (rgm->controls.is_empty ())
11221 rgm->controls.safe_grow_cleared (nvectors, true);
11222 for (unsigned int i = 0; i < nvectors; ++i)
11224 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11225 /* Provide a dummy definition until the real one is available. */
11226 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11227 rgm->controls[i] = mask;
11231 tree mask = rgm->controls[index];
11232 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11233 TYPE_VECTOR_SUBPARTS (vectype)))
11235 /* A loop mask for data type X can be reused for data type Y
11236 if X has N times more elements than Y and if Y's elements
11237 are N times bigger than X's. In this case each sequence
11238 of N elements in the loop mask will be all-zero or all-one.
11239 We can then view-convert the mask so that each sequence of
11240 N elements is replaced by a single element. */
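/* For example, a mask with sixteen byte-sized elements can control a
   vector of eight halfword elements: each pair of adjacent mask
   elements is all-zero or all-one, so the VIEW_CONVERT_EXPR collapses
   every such pair into a single element of the halfword mask.  */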
11241 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11242 TYPE_VECTOR_SUBPARTS (vectype)));
11243 gimple_seq seq = NULL;
11244 mask_type = truth_type_for (vectype);
11245 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11246 if (seq)
11247 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11249 return mask;
11251 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11252 == vect_partial_vectors_avx512)
11254 /* The number of scalars per iteration and the number of vectors are
11255 both compile-time constants. */
11256 unsigned int nscalars_per_iter
11257 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11258 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11260 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11262 /* The stored nV is dependent on the mask type produced. */
11263 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11264 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11265 == rgm->factor);
11266 nvectors = rgm->factor;
11268 /* Populate the rgroup's mask array, if this is the first time we've
11269 used it. */
11270 if (rgm->controls.is_empty ())
11272 rgm->controls.safe_grow_cleared (nvectors, true);
11273 for (unsigned int i = 0; i < nvectors; ++i)
11275 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11276 /* Provide a dummy definition until the real one is available. */
11277 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11278 rgm->controls[i] = mask;
11281 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11282 TYPE_VECTOR_SUBPARTS (vectype)))
11283 return rgm->controls[index];
11285 /* Split the vector if needed. Since we are dealing with integer mode
11286 masks with AVX512 we can operate on the integer representation,
11287 performing the shift on the whole vector. */
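/* For example, if the rgroup mask type has sixteen elements while
   VECTYPE has eight, FACTOR is 2 and mask number 3 is the upper half
   of rgm->controls[1]: shift its integer representation right by
   eight bits and truncate it to an eight-bit mask.  */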
11288 unsigned HOST_WIDE_INT factor;
11289 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11290 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11291 gcc_assert (ok);
11292 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11293 tree mask_type = truth_type_for (vectype);
11294 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11295 unsigned vi = index / factor;
11296 unsigned vpart = index % factor;
11297 tree vec = rgm->controls[vi];
11298 gimple_seq seq = NULL;
11299 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11300 lang_hooks.types.type_for_mode
11301 (TYPE_MODE (rgm->type), 1), vec);
11302 /* For integer mode masks simply shift the right bits into position. */
11303 if (vpart != 0)
11304 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11305 build_int_cst (integer_type_node,
11306 (TYPE_VECTOR_SUBPARTS (vectype)
11307 * vpart)));
11308 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11309 (TYPE_MODE (mask_type), 1), vec);
11310 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11311 if (seq)
11312 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11313 return vec;
11315 else
11316 gcc_unreachable ();
11319 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11320 lengths for controlling an operation on VECTYPE. The operation splits
11321 each element of VECTYPE into FACTOR separate subelements, measuring the
11322 length as a number of these subelements. */
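/* For example, an operation on V4SI elements whose length is measured
   in bytes (the VnQI fallback mentioned below) uses FACTOR 4, so a
   full vector corresponds to a length of 16 subelements.  */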
11324 void
11325 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11326 unsigned int nvectors, tree vectype, unsigned int factor)
11328 gcc_assert (nvectors != 0);
11329 if (lens->length () < nvectors)
11330 lens->safe_grow_cleared (nvectors, true);
11331 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11333 /* The number of scalars per iteration, the bytes occupied per scalar and
11334 the number of vectors are all compile-time constants. */
11335 unsigned int nscalars_per_iter
11336 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11337 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11339 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11341 /* For now, we only support cases in which all loads and stores fall back
11342 to VnQI or none do. */
11343 gcc_assert (!rgl->max_nscalars_per_iter
11344 || (rgl->factor == 1 && factor == 1)
11345 || (rgl->max_nscalars_per_iter * rgl->factor
11346 == nscalars_per_iter * factor));
11347 rgl->max_nscalars_per_iter = nscalars_per_iter;
11348 rgl->type = vectype;
11349 rgl->factor = factor;
11353 /* Given a complete set of lengths LENS, extract length number INDEX
11354 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11355 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11356 multiplied by the number of elements that should be processed.
11357 Insert any set-up statements before GSI. */
11359 tree
11360 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11361 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11362 unsigned int index, unsigned int factor)
11364 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11365 bool use_bias_adjusted_len =
11366 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11368 /* Populate the rgroup's len array, if this is the first time we've
11369 used it. */
11370 if (rgl->controls.is_empty ())
11372 rgl->controls.safe_grow_cleared (nvectors, true);
11373 for (unsigned int i = 0; i < nvectors; ++i)
11375 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11376 gcc_assert (len_type != NULL_TREE);
11378 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11380 /* Provide a dummy definition until the real one is available. */
11381 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11382 rgl->controls[i] = len;
11384 if (use_bias_adjusted_len)
11386 gcc_assert (i == 0);
11387 tree adjusted_len =
11388 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11389 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11390 rgl->bias_adjusted_ctrl = adjusted_len;
11395 if (use_bias_adjusted_len)
11396 return rgl->bias_adjusted_ctrl;
11398 tree loop_len = rgl->controls[index];
11399 if (rgl->factor == 1 && factor == 1)
11401 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11402 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11403 if (maybe_ne (nunits1, nunits2))
11405 /* A loop len for data type X can be reused for data type Y
11406 if X has N times more elements than Y and if Y's elements
11407 are N times bigger than X's. */
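/* For example, a length counted in the sixteen byte-sized elements of
   the rgroup type is divided by two to control a vector of eight
   halfword elements.  */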
11408 gcc_assert (multiple_p (nunits1, nunits2));
11409 factor = exact_div (nunits1, nunits2).to_constant ();
11410 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11411 gimple_seq seq = NULL;
11412 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11413 build_int_cst (iv_type, factor));
11414 if (seq)
11415 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11418 return loop_len;
11421 /* Scale profiling counters by estimation for LOOP which is vectorized
11422 by factor VF.
11423 If FLAT is true, the loop we started with had unrealistically flat
11424 profile. */
11426 static void
11427 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11429 /* For flat profiles do not scale down proportionally by VF but only
11430 cap by the known iteration count bounds. */
11431 if (flat)
11433 if (dump_file && (dump_flags & TDF_DETAILS))
11434 fprintf (dump_file,
11435 "Vectorized loop profile seems flat; not scaling iteration "
11436 "count down by the vectorization factor %i\n", vf);
11437 scale_loop_profile (loop, profile_probability::always (),
11438 get_likely_max_loop_iterations_int (loop));
11439 return;
11441 /* The loop body executes VF times fewer iterations and the exit is taken VF times more often. */
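/* For example, with VF 4 a header count of 800 per 100 entries is
   scaled down to roughly 200 and the exit probability rises from
   1/8 to 1/2.  */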
11442 profile_count entry_count = loop_preheader_edge (loop)->count ();
11444 /* If we have an unreliable loop profile avoid dropping the entry
11445 count below the header count. This can happen when loops
11446 have unrealistically low trip counts. */
11447 while (vf > 1
11448 && loop->header->count > entry_count
11449 && loop->header->count < entry_count * vf)
11451 if (dump_file && (dump_flags & TDF_DETAILS))
11452 fprintf (dump_file,
11453 "Vectorization factor %i seems too large for profile "
11454 "prevoiusly believed to be consistent; reducing.\n", vf);
11455 vf /= 2;
11458 if (entry_count.nonzero_p ())
11459 set_edge_probability_and_rescale_others
11460 (exit_e,
11461 entry_count.probability_in (loop->header->count / vf));
11462 /* Avoid producing a very large exit probability when we do not have
11463 a sensible profile. */
11464 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11465 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11466 loop->latch->count = single_pred_edge (loop->latch)->count ();
11468 scale_loop_profile (loop, profile_probability::always () / vf,
11469 get_likely_max_loop_iterations_int (loop));
11472 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11473 latch edge values originally defined by it. */
11475 static void
11476 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11477 stmt_vec_info def_stmt_info)
11479 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11480 if (!def || TREE_CODE (def) != SSA_NAME)
11481 return;
11482 stmt_vec_info phi_info;
11483 imm_use_iterator iter;
11484 use_operand_p use_p;
11485 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11487 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11488 if (!phi)
11489 continue;
11490 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11491 && (phi_info = loop_vinfo->lookup_stmt (phi))
11492 && STMT_VINFO_RELEVANT_P (phi_info)))
11493 continue;
11494 loop_p loop = gimple_bb (phi)->loop_father;
11495 edge e = loop_latch_edge (loop);
11496 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11497 continue;
11499 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11500 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11501 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11503 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11504 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11505 gcc_assert (phi_defs.length () == latch_defs.length ());
11506 for (unsigned i = 0; i < phi_defs.length (); ++i)
11507 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11508 gimple_get_lhs (latch_defs[i]), e,
11509 gimple_phi_arg_location (phi, e->dest_idx));
11511 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11513 /* For first order recurrences we have to update both uses of
11514 the latch definition, the one in the PHI node and the one
11515 in the generated VEC_PERM_EXPR. */
11516 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11517 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11518 gcc_assert (phi_defs.length () == latch_defs.length ());
11519 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11520 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11521 for (unsigned i = 0; i < phi_defs.length (); ++i)
11523 gassign *perm = as_a <gassign *> (phi_defs[i]);
11524 if (i > 0)
11525 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11526 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11527 update_stmt (perm);
11529 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11530 gimple_phi_arg_location (phi, e->dest_idx));
11535 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11536 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11537 stmt_vec_info. */
11539 static bool
11540 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11541 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11543 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11544 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11546 if (dump_enabled_p ())
11547 dump_printf_loc (MSG_NOTE, vect_location,
11548 "------>vectorizing statement: %G", stmt_info->stmt);
11550 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11551 vect_loop_kill_debug_uses (loop, stmt_info);
11553 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11554 && !STMT_VINFO_LIVE_P (stmt_info))
11556 if (is_gimple_call (stmt_info->stmt)
11557 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11559 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11560 *seen_store = stmt_info;
11561 return false;
11563 return false;
11566 if (STMT_VINFO_VECTYPE (stmt_info))
11568 poly_uint64 nunits
11569 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11570 if (!STMT_SLP_TYPE (stmt_info)
11571 && maybe_ne (nunits, vf)
11572 && dump_enabled_p ())
11573 /* For SLP the VF is set according to the unrolling factor, and not
11574 to the vector size, hence for SLP this print is not valid. */
11575 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11578 /* Pure SLP statements have already been vectorized. We still need
11579 to apply loop vectorization to hybrid SLP statements. */
11580 if (PURE_SLP_STMT (stmt_info))
11581 return false;
11583 if (dump_enabled_p ())
11584 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11586 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11587 *seen_store = stmt_info;
11589 return true;
11592 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11593 in the hash_map with their corresponding values. */
11595 static tree
11596 find_in_mapping (tree t, void *context)
11598 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11600 tree *value = mapping->get (t);
11601 return value ? *value : t;
11604 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11605 original loop that has now been vectorized.
11607 The inits of the data_references need to be advanced with the number of
11608 iterations of the main loop. This has been computed in vect_do_peeling and
11609 is stored in parameter ADVANCE. We first restore the data_references'
11610 initial offsets with the values recorded in ORIG_DRS_INIT.
11612 Since the loop_vec_info of this EPILOGUE was constructed for the original
11613 loop, its stmt_vec_infos all point to the original statements. These need
11614 to be updated to point to their corresponding copies as well as the SSA_NAMES
11615 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11617 The data_reference's connections also need to be updated. Their
11618 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11619 stmt_vec_infos, their statements need to point to their corresponding copy,
11620 if they are gather loads or scatter stores then their reference needs to be
11621 updated to point to its corresponding copy and finally we set
11622 'base_misaligned' to false as we have already peeled for alignment in the
11623 prologue of the main loop. */
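/* A sketch for illustration (the concrete numbers are invented, not taken
   from a testcase): if the prologue plus the main vector loop together cover
   ADVANCE scalar iterations, then a unit-stride reference such as a[i] with
   4-byte elements has its init advanced by ADVANCE * 4 bytes, so the
   epilogue's first access continues exactly where the main loop stopped.  */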
11625 static void
11626 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11628 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11629 auto_vec<gimple *> stmt_worklist;
11630 hash_map<tree,tree> mapping;
11631 gimple *orig_stmt, *new_stmt;
11632 gimple_stmt_iterator epilogue_gsi;
11633 gphi_iterator epilogue_phi_gsi;
11634 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11635 basic_block *epilogue_bbs = get_loop_body (epilogue);
11636 unsigned i;
11638 free (LOOP_VINFO_BBS (epilogue_vinfo));
11639 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11641 /* Advance the data_references with the number of iterations of the previous
11642 loop and its prologue. */
11643 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11646 /* The EPILOGUE loop is a copy of the original loop so they share the same
11647 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11648 point to the copied statements. We also create a mapping of all LHS' in
11649 the original loop and all the LHS' in the EPILOGUE and create worklists to
11650 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
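  /* As an illustration (a sketch with invented SSA names): if the original
     loop contained  _23 = *p_10;  and its copy in the EPILOGUE is
     _57 = *p_44;  the two share a gimple UID, so the stmt_vec_info is
     redirected to the copy and MAPPING records _23 -> _57.  Pattern
     statements that still mention _23 are queued in STMT_WORKLIST and
     rewritten to use _57 further below.  */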
11651 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11653 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11654 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11656 new_stmt = epilogue_phi_gsi.phi ();
11658 gcc_assert (gimple_uid (new_stmt) > 0);
11659 stmt_vinfo
11660 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11662 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11663 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11665 mapping.put (gimple_phi_result (orig_stmt),
11666 gimple_phi_result (new_stmt));
11667 /* PHI nodes can not have patterns or related statements. */
11668 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11669 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11672 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11673 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11675 new_stmt = gsi_stmt (epilogue_gsi);
11676 if (is_gimple_debug (new_stmt))
11677 continue;
11679 gcc_assert (gimple_uid (new_stmt) > 0);
11680 stmt_vinfo
11681 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11683 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11684 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11686 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11687 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11689 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11691 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11692 for (gimple_stmt_iterator gsi = gsi_start (seq);
11693 !gsi_end_p (gsi); gsi_next (&gsi))
11694 stmt_worklist.safe_push (gsi_stmt (gsi));
11697 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11698 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11700 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11701 stmt_worklist.safe_push (stmt);
11702 /* Set BB such that the assert in
11703 'get_initial_def_for_reduction' is able to determine that
11704 the BB of the related stmt is inside this loop. */
11705 gimple_set_bb (stmt,
11706 gimple_bb (new_stmt));
11707 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11708 gcc_assert (related_vinfo == NULL
11709 || related_vinfo == stmt_vinfo);
11714 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11715 using the original main loop and thus need to be updated to refer to the
11716 cloned variables used in the epilogue. */
11717 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11719 gimple *stmt = stmt_worklist[i];
11720 tree *new_op;
11722 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11724 tree op = gimple_op (stmt, j);
11725 if ((new_op = mapping.get(op)))
11726 gimple_set_op (stmt, j, *new_op);
11727 else
11729 /* PR92429: The last argument of simplify_replace_tree disables
11730 folding when replacing arguments. This is required as
11731 otherwise you might end up with different statements than the
11732 ones analyzed in vect_analyze_loop, leading to different
11733 vectorization. */
11734 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11735 &find_in_mapping, &mapping, false);
11736 gimple_set_op (stmt, j, op);
11741 struct data_reference *dr;
11742 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11743 FOR_EACH_VEC_ELT (datarefs, i, dr)
11745 orig_stmt = DR_STMT (dr);
11746 gcc_assert (gimple_uid (orig_stmt) > 0);
11747 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11748 /* Data references for gather loads and scatter stores do not use the
11749 updated offset we set using ADVANCE. Instead we have to make sure the
11750 reference in the data reference points to the corresponding copy of
11751 the original in the epilogue. Make sure to update both
11752 gather/scatters recognized by dataref analysis and other
11753 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11754 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11755 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11756 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11758 DR_REF (dr)
11759 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11760 &find_in_mapping, &mapping);
11761 DR_BASE_ADDRESS (dr)
11762 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11763 &find_in_mapping, &mapping);
11765 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11766 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11767 /* The vector size of the epilogue is smaller than that of the main loop,
11768 so the required alignment is the same or lower. This means the dr
11769 is by definition still aligned here. */
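    /* For example (a sketch): if the main loop used 64-byte vectors and its
       prologue peeled until the access was 64-byte aligned, then the
       epilogue's 32-byte or narrower vector accesses to the same base are
       necessarily aligned as well.  */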
11770 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11773 epilogue_vinfo->shared->datarefs_copy.release ();
11774 epilogue_vinfo->shared->save_datarefs ();
11777 /* When vectorizing early break statements, instructions that happen before
11778 the early break in the current BB need to be moved to after the early
11779 break. This function deals with that and assumes that any validity
11780 checks have already been performed.
11782 While moving the instructions, if it encounters a VUSE or VDEF it
11783 corrects the VUSEs as it moves the statements along. The moved statements
11784 are inserted at the start of LOOP_VINFO_EARLY_BRK_DEST_BB. */
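/* For illustration (a sketch, not from a particular testcase): in

     for (i = 0; i < N; i++)
       {
         c[i] = x;
         if (b[i] > limit)
           break;
       }

   the store to c[i] appears before the early exit in the loop body.  The
   stores recorded in LOOP_VINFO_EARLY_BRK_STORES are moved below the exit
   test into LOOP_VINFO_EARLY_BRK_DEST_BB, and the statements recorded in
   LOOP_VINFO_EARLY_BRK_VUSES are updated to their new reaching virtual
   use.  */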
11786 static void
11787 move_early_exit_stmts (loop_vec_info loop_vinfo)
11789 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11791 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11792 return;
11794 /* Move all stmts that need moving. */
11795 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11796 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11798 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11800 /* Check to see if statement is still required for vect or has been
11801 elided. */
11802 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11803 if (!stmt_info)
11804 continue;
11806 if (dump_enabled_p ())
11807 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11809 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11810 gsi_move_before (&stmt_gsi, &dest_gsi);
11811 gsi_prev (&dest_gsi);
11814 /* Update all the stmts with their new reaching VUSES. */
11815 tree vuse
11816 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11817 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11819 if (dump_enabled_p ())
11820 dump_printf_loc (MSG_NOTE, vect_location,
11821 "updating vuse to %T for load %G", vuse, p);
11822 gimple_set_vuse (p, vuse);
11823 update_stmt (p);
11827 /* Function vect_transform_loop.
11829 The analysis phase has determined that the loop is vectorizable.
11830 Vectorize the loop - create vectorized stmts to replace the scalar
11831 stmts in the loop, and update the loop exit condition.
11832 Returns scalar epilogue loop if any. */
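/* In outline (a summary of the steps performed below):
     1) compute the cost-model threshold and version the loop if required;
     2) peel a prologue/epilogue via vect_do_peeling and compute the number
        of iterations of the vector loop (NITERS_VECTOR);
     3) vectorize the PHIs and statements of every basic block, scheduling
        any SLP instances first;
     4) set the new exit condition and update the iteration-count bounds,
        estimates, profile and, if present, the epilogue's loop_vec_info.  */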
11834 class loop *
11835 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11838 class loop *epilogue = NULL;
11839 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11840 int nbbs = loop->num_nodes;
11841 int i;
11842 tree niters_vector = NULL_TREE;
11843 tree step_vector = NULL_TREE;
11844 tree niters_vector_mult_vf = NULL_TREE;
11845 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11846 unsigned int lowest_vf = constant_lower_bound (vf);
11847 gimple *stmt;
11848 bool check_profitability = false;
11849 unsigned int th;
11850 bool flat = maybe_flat_loop_profile (loop);
11852 DUMP_VECT_SCOPE ("vec_transform_loop");
11854 loop_vinfo->shared->check_datarefs ();
11856 /* Use the more conservative vectorization threshold. If the number
11857 of iterations is constant assume the cost check has been performed
11858 by our caller. If the threshold makes all loops profitable that
11859 run at least the (estimated) vectorization factor number of times
11860 checking is pointless, too. */
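  /* A sketch of what CHECK_PROFITABILITY requests (the guard itself is
     emitted by the versioning/peeling code, not here):

       if (niters >= th)
         ... vectorized loop ...
       else
         ... scalar loop ...

     so that loops running too few iterations do not pay the vectorization
     overhead.  */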
11861 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11862 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11864 if (dump_enabled_p ())
11865 dump_printf_loc (MSG_NOTE, vect_location,
11866 "Profitability threshold is %d loop iterations.\n",
11867 th);
11868 check_profitability = true;
11871 /* Make sure there exists a single-predecessor exit bb. Do this before
11872 versioning. */
11873 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11874 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11876 split_loop_exit_edge (e, true);
11877 if (dump_enabled_p ())
11878 dump_printf (MSG_NOTE, "split exit edge\n");
11881 /* Version the loop first, if required, so the profitability check
11882 comes first. */
11884 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11886 class loop *sloop
11887 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11888 sloop->force_vectorize = false;
11889 check_profitability = false;
11892 /* Make sure there exists a single-predecessor exit bb also on the
11893 scalar loop copy. Do this after versioning but before peeling
11894 so the CFG structure is fine for both the scalar and the if-converted
11895 loop and slpeel_duplicate_current_defs_from_edges faces matched
11896 loop-closed PHI nodes on the exit. */
11897 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11899 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11900 if (! single_pred_p (e->dest))
11902 split_loop_exit_edge (e, true);
11903 if (dump_enabled_p ())
11904 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11908 tree niters = vect_build_loop_niters (loop_vinfo);
11909 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11910 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11911 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11912 tree advance;
11913 drs_init_vec orig_drs_init;
11915 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11916 &step_vector, &niters_vector_mult_vf, th,
11917 check_profitability, niters_no_overflow,
11918 &advance);
11919 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11920 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11922 /* Ifcvt duplicates loop preheader, loop body and produces a basic
11923 block after loop exit. We need to scale all that. */
11924 basic_block preheader
11925 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11926 preheader->count
11927 = preheader->count.apply_probability
11928 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11929 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11930 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11931 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11932 = preheader->count;
11935 if (niters_vector == NULL_TREE)
11937 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11938 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11939 && known_eq (lowest_vf, vf))
11941 niters_vector
11942 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11943 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11944 step_vector = build_one_cst (TREE_TYPE (niters));
11946 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11947 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11948 &step_vector, niters_no_overflow);
11949 else
11950 /* vect_do_peeling subtracted the number of peeled prologue
11951 iterations from LOOP_VINFO_NITERS. */
11952 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11953 &niters_vector, &step_vector,
11954 niters_no_overflow);
11957 /* 1) Make sure the loop header has exactly two entries
11958 2) Make sure we have a preheader basic block. */
11960 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11962 split_edge (loop_preheader_edge (loop));
11964 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11965 /* This will deal with any possible peeling. */
11966 vect_prepare_for_masked_peels (loop_vinfo);
11968 /* Handle any code motion that we need to for early-break vectorization after
11969 we've done peeling but just before we start vectorizing. */
11970 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11971 move_early_exit_stmts (loop_vinfo);
11973 /* Schedule the SLP instances first, then handle loop vectorization
11974 below. */
11975 if (!loop_vinfo->slp_instances.is_empty ())
11977 DUMP_VECT_SCOPE ("scheduling SLP instances");
11978 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11981 /* FORNOW: the vectorizer supports only loops whose body consists
11982 of one basic block (header + empty latch). When the vectorizer
11983 supports more involved loop forms, the order in which the BBs are
11984 traversed needs to be reconsidered. */
11986 for (i = 0; i < nbbs; i++)
11988 basic_block bb = bbs[i];
11989 stmt_vec_info stmt_info;
11991 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11992 gsi_next (&si))
11994 gphi *phi = si.phi ();
11995 if (dump_enabled_p ())
11996 dump_printf_loc (MSG_NOTE, vect_location,
11997 "------>vectorizing phi: %G", (gimple *) phi);
11998 stmt_info = loop_vinfo->lookup_stmt (phi);
11999 if (!stmt_info)
12000 continue;
12002 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12003 vect_loop_kill_debug_uses (loop, stmt_info);
12005 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12006 && !STMT_VINFO_LIVE_P (stmt_info))
12007 continue;
12009 if (STMT_VINFO_VECTYPE (stmt_info)
12010 && (maybe_ne
12011 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12012 && dump_enabled_p ())
12013 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12015 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12017 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12018 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12019 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12020 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12021 && ! PURE_SLP_STMT (stmt_info))
12023 if (dump_enabled_p ())
12024 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12025 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12029 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12030 gsi_next (&si))
12032 gphi *phi = si.phi ();
12033 stmt_info = loop_vinfo->lookup_stmt (phi);
12034 if (!stmt_info)
12035 continue;
12037 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12038 && !STMT_VINFO_LIVE_P (stmt_info))
12039 continue;
12041 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12042 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12043 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12044 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12045 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12046 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12047 && ! PURE_SLP_STMT (stmt_info))
12048 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12051 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12052 !gsi_end_p (si);)
12054 stmt = gsi_stmt (si);
12055 /* During vectorization remove existing clobber stmts. */
12056 if (gimple_clobber_p (stmt))
12058 unlink_stmt_vdef (stmt);
12059 gsi_remove (&si, true);
12060 release_defs (stmt);
12062 else
12064 /* Ignore vector stmts created in the outer loop. */
12065 stmt_info = loop_vinfo->lookup_stmt (stmt);
12067 /* vector stmts created in the outer-loop during vectorization of
12068 stmts in an inner-loop may not have a stmt_info, and do not
12069 need to be vectorized. */
12070 stmt_vec_info seen_store = NULL;
12071 if (stmt_info)
12073 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12075 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12076 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12077 !gsi_end_p (subsi); gsi_next (&subsi))
12079 stmt_vec_info pat_stmt_info
12080 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12081 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12082 &si, &seen_store);
12084 stmt_vec_info pat_stmt_info
12085 = STMT_VINFO_RELATED_STMT (stmt_info);
12086 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12087 &si, &seen_store))
12088 maybe_set_vectorized_backedge_value (loop_vinfo,
12089 pat_stmt_info);
12091 else
12093 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12094 &seen_store))
12095 maybe_set_vectorized_backedge_value (loop_vinfo,
12096 stmt_info);
12099 gsi_next (&si);
12100 if (seen_store)
12102 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12103 /* Interleaving. The vectorization of the
12104 interleaving chain was completed - free
12105 all the stores in the chain. */
12106 vect_remove_stores (loop_vinfo,
12107 DR_GROUP_FIRST_ELEMENT (seen_store));
12108 else
12109 /* Free the attached stmt_vec_info and remove the stmt. */
12110 loop_vinfo->remove_stmt (stmt_info);
12115 /* Stub out scalar statements that must not survive vectorization.
12116 Doing this here helps with grouped statements, or statements that
12117 are involved in patterns. */
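      /* For example (a sketch): a leftover scalar statement such as
           _5 = .MASK_LOAD (ptr_4, 32B, mask_7);
         whose LHS is not a vector is replaced by  _5 = 0;  and a scalar
           _9 = .COND_ADD (mask_7, _1, _2, _3);
         is replaced by  _9 = _3;  i.e. by its "else" argument.  */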
12118 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12119 !gsi_end_p (gsi); gsi_next (&gsi))
12121 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12122 if (!call || !gimple_call_internal_p (call))
12123 continue;
12124 internal_fn ifn = gimple_call_internal_fn (call);
12125 if (ifn == IFN_MASK_LOAD)
12127 tree lhs = gimple_get_lhs (call);
12128 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12130 tree zero = build_zero_cst (TREE_TYPE (lhs));
12131 gimple *new_stmt = gimple_build_assign (lhs, zero);
12132 gsi_replace (&gsi, new_stmt, true);
12135 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12137 tree lhs = gimple_get_lhs (call);
12138 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12140 tree else_arg
12141 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12142 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12143 gsi_replace (&gsi, new_stmt, true);
12147 } /* BBs in loop */
12149 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12150 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12151 if (integer_onep (step_vector))
12152 niters_no_overflow = true;
12153 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12154 niters_vector, step_vector, niters_vector_mult_vf,
12155 !niters_no_overflow);
12157 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12159 /* True if the final iteration might not handle a full vector's
12160 worth of scalar iterations. */
12161 bool final_iter_may_be_partial
12162 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12163 /* The minimum number of iterations performed by the epilogue. This
12164 is 1 when peeling for gaps because we always need a final scalar
12165 iteration. */
12166 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12167 /* +1 to convert latch counts to loop iteration counts,
12168 -min_epilogue_iters to remove iterations that cannot be performed
12169 by the vector code. */
12170 int bias_for_lowest = 1 - min_epilogue_iters;
12171 int bias_for_assumed = bias_for_lowest;
12172 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12173 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12175 /* When the amount of peeling is known at compile time, the first
12176 iteration will have exactly alignment_npeels active elements.
12177 In the worst case it will have at least one. */
12178 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12179 bias_for_lowest += lowest_vf - min_first_active;
12180 bias_for_assumed += assumed_vf - min_first_active;
12182 /* In these calculations the "- 1" converts loop iteration counts
12183 back to latch counts. */
12184 if (loop->any_upper_bound)
12186 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12187 loop->nb_iterations_upper_bound
12188 = (final_iter_may_be_partial
12189 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12190 lowest_vf) - 1
12191 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12192 lowest_vf) - 1);
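      /* Worked example (numbers invented): with a constant VF of 4, no
         peeling for gaps and no partial vectors, BIAS_FOR_LOWEST is 1.  If
         the scalar latch ran at most 100 times (at most 101 iterations),
         the vector loop runs at most floor ((100 + 1) / 4) = 25 iterations,
         i.e. a latch bound of 24.  With partial vectors the last iteration
         may be partial, giving ceil ((100 + 1) / 4) - 1 = 25 instead.  */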
12193 if (main_vinfo
12194 /* Both peeling for alignment and peeling for gaps can end up
12195 with the scalar epilogue running for more than VF-1 iterations. */
12196 && !main_vinfo->peeling_for_alignment
12197 && !main_vinfo->peeling_for_gaps)
12199 unsigned int bound;
12200 poly_uint64 main_iters
12201 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12202 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12203 main_iters
12204 = upper_bound (main_iters,
12205 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12206 if (can_div_away_from_zero_p (main_iters,
12207 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12208 &bound))
12209 loop->nb_iterations_upper_bound
12210 = wi::umin ((bound_wide_int) (bound - 1),
12211 loop->nb_iterations_upper_bound);
12214 if (loop->any_likely_upper_bound)
12215 loop->nb_iterations_likely_upper_bound
12216 = (final_iter_may_be_partial
12217 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12218 + bias_for_lowest, lowest_vf) - 1
12219 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12220 + bias_for_lowest, lowest_vf) - 1);
12221 if (loop->any_estimate)
12222 loop->nb_iterations_estimate
12223 = (final_iter_may_be_partial
12224 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12225 assumed_vf) - 1
12226 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12227 assumed_vf) - 1);
12228 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12229 assumed_vf, flat);
12231 if (dump_enabled_p ())
12233 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12235 dump_printf_loc (MSG_NOTE, vect_location,
12236 "LOOP VECTORIZED\n");
12237 if (loop->inner)
12238 dump_printf_loc (MSG_NOTE, vect_location,
12239 "OUTER LOOP VECTORIZED\n");
12240 dump_printf (MSG_NOTE, "\n");
12242 else
12243 dump_printf_loc (MSG_NOTE, vect_location,
12244 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12245 GET_MODE_NAME (loop_vinfo->vector_mode));
12248 /* Loops vectorized with a variable factor won't benefit from
12249 unrolling/peeling. */
12250 if (!vf.is_constant ())
12252 loop->unroll = 1;
12253 if (dump_enabled_p ())
12254 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12255 " variable-length vectorization factor\n");
12257 /* Free SLP instances here because otherwise stmt reference counting
12258 won't work. */
12259 slp_instance instance;
12260 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12261 vect_free_slp_instance (instance);
12262 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12263 /* Clear the safelen field since its value is invalid after vectorization:
12264 the vectorized loop can have loop-carried dependencies. */
12265 loop->safelen = 0;
12267 if (epilogue)
12269 update_epilogue_loop_vinfo (epilogue, advance);
12271 epilogue->simduid = loop->simduid;
12272 epilogue->force_vectorize = loop->force_vectorize;
12273 epilogue->dont_vectorize = false;
12276 return epilogue;
12279 /* The code below is trying to perform a simple optimization - revert
12280 if-conversion for masked stores, i.e. if the mask of a store is zero
12281 do not perform it, and if possible skip the stored-value producers too.
12282 For example,
12283 for (i=0; i<n; i++)
12284 if (c[i])
12286 p1[i] += 1;
12287 p2[i] = p3[i] +2;
12289 this transformation will produce the following semi-hammock:
12291 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12293 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12294 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12295 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12296 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12297 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12298 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12302 void
12303 optimize_mask_stores (class loop *loop)
12305 basic_block *bbs = get_loop_body (loop);
12306 unsigned nbbs = loop->num_nodes;
12307 unsigned i;
12308 basic_block bb;
12309 class loop *bb_loop;
12310 gimple_stmt_iterator gsi;
12311 gimple *stmt;
12312 auto_vec<gimple *> worklist;
12313 auto_purge_vect_location sentinel;
12315 vect_location = find_loop_location (loop);
12316 /* Pick up all masked stores in loop if any. */
12317 for (i = 0; i < nbbs; i++)
12319 bb = bbs[i];
12320 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12321 gsi_next (&gsi))
12323 stmt = gsi_stmt (gsi);
12324 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12325 worklist.safe_push (stmt);
12329 free (bbs);
12330 if (worklist.is_empty ())
12331 return;
12333 /* Loop has masked stores. */
12334 while (!worklist.is_empty ())
12336 gimple *last, *last_store;
12337 edge e, efalse;
12338 tree mask;
12339 basic_block store_bb, join_bb;
12340 gimple_stmt_iterator gsi_to;
12341 tree vdef, new_vdef;
12342 gphi *phi;
12343 tree vectype;
12344 tree zero;
12346 last = worklist.pop ();
12347 mask = gimple_call_arg (last, 2);
12348 bb = gimple_bb (last);
12349 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
12350 to the same loop as if_bb. It can be different from LOOP when a
12351 two-level loop-nest is vectorized and the mask_store belongs to the
12352 inner one. */
12353 e = split_block (bb, last);
12354 bb_loop = bb->loop_father;
12355 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12356 join_bb = e->dest;
12357 store_bb = create_empty_bb (bb);
12358 add_bb_to_loop (store_bb, bb_loop);
12359 e->flags = EDGE_TRUE_VALUE;
12360 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12361 /* Put STORE_BB on the likely path. */
12362 efalse->probability = profile_probability::likely ();
12363 e->probability = efalse->probability.invert ();
12364 store_bb->count = efalse->count ();
12365 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12366 if (dom_info_available_p (CDI_DOMINATORS))
12367 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12368 if (dump_enabled_p ())
12369 dump_printf_loc (MSG_NOTE, vect_location,
12370 "Create new block %d to sink mask stores.",
12371 store_bb->index);
12372 /* Create vector comparison with boolean result. */
12373 vectype = TREE_TYPE (mask);
12374 zero = build_zero_cst (vectype);
12375 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12376 gsi = gsi_last_bb (bb);
12377 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12378 /* Create new PHI node for vdef of the last masked store:
12379 .MEM_2 = VDEF <.MEM_1>
12380 will be converted to
12381 .MEM.3 = VDEF <.MEM_1>
12382 and new PHI node will be created in join bb
12383 .MEM_2 = PHI <.MEM_1, .MEM_3>
12385 vdef = gimple_vdef (last);
12386 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12387 gimple_set_vdef (last, new_vdef);
12388 phi = create_phi_node (vdef, join_bb);
12389 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12391 /* Put all masked stores with the same mask to STORE_BB if possible. */
12392 while (true)
12394 gimple_stmt_iterator gsi_from;
12395 gimple *stmt1 = NULL;
12397 /* Move masked store to STORE_BB. */
12398 last_store = last;
12399 gsi = gsi_for_stmt (last);
12400 gsi_from = gsi;
12401 /* Shift GSI to the previous stmt for further traversal. */
12402 gsi_prev (&gsi);
12403 gsi_to = gsi_start_bb (store_bb);
12404 gsi_move_before (&gsi_from, &gsi_to);
12405 /* Setup GSI_TO to the non-empty block start. */
12406 gsi_to = gsi_start_bb (store_bb);
12407 if (dump_enabled_p ())
12408 dump_printf_loc (MSG_NOTE, vect_location,
12409 "Move stmt to created bb\n%G", last);
12410 /* Move all stored value producers if possible. */
12411 while (!gsi_end_p (gsi))
12413 tree lhs;
12414 imm_use_iterator imm_iter;
12415 use_operand_p use_p;
12416 bool res;
12418 /* Skip debug statements. */
12419 if (is_gimple_debug (gsi_stmt (gsi)))
12421 gsi_prev (&gsi);
12422 continue;
12424 stmt1 = gsi_stmt (gsi);
12425 /* Do not consider statements writing to memory or having
12426 a volatile operand. */
12427 if (gimple_vdef (stmt1)
12428 || gimple_has_volatile_ops (stmt1))
12429 break;
12430 gsi_from = gsi;
12431 gsi_prev (&gsi);
12432 lhs = gimple_get_lhs (stmt1);
12433 if (!lhs)
12434 break;
12436 /* LHS of vectorized stmt must be SSA_NAME. */
12437 if (TREE_CODE (lhs) != SSA_NAME)
12438 break;
12440 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12442 /* Remove dead scalar statement. */
12443 if (has_zero_uses (lhs))
12445 gsi_remove (&gsi_from, true);
12446 continue;
12450 /* Check that LHS does not have uses outside of STORE_BB. */
12451 res = true;
12452 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12454 gimple *use_stmt;
12455 use_stmt = USE_STMT (use_p);
12456 if (is_gimple_debug (use_stmt))
12457 continue;
12458 if (gimple_bb (use_stmt) != store_bb)
12460 res = false;
12461 break;
12464 if (!res)
12465 break;
12467 if (gimple_vuse (stmt1)
12468 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12469 break;
12471 /* Can move STMT1 to STORE_BB. */
12472 if (dump_enabled_p ())
12473 dump_printf_loc (MSG_NOTE, vect_location,
12474 "Move stmt to created bb\n%G", stmt1);
12475 gsi_move_before (&gsi_from, &gsi_to);
12476 /* Shift GSI_TO for further insertion. */
12477 gsi_prev (&gsi_to);
12479 /* Put other masked stores with the same mask to STORE_BB. */
12480 if (worklist.is_empty ()
12481 || gimple_call_arg (worklist.last (), 2) != mask
12482 || worklist.last () != stmt1)
12483 break;
12484 last = worklist.pop ();
12486 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12490 /* Decide whether it is possible to use a zero-based induction variable
12491 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12492 the value that the induction variable must be able to hold in order
12493 to ensure that the rgroups eventually have no active vector elements.
12494 Return -1 otherwise. */
12496 widest_int
12497 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12499 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12501 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12503 /* Calculate the value that the induction variable must be able
12504 to hit in order to ensure that we end the loop with an all-false mask.
12505 This involves adding the maximum number of inactive trailing scalar
12506 iterations. */
12507 widest_int iv_limit = -1;
12508 if (max_loop_iterations (loop, &iv_limit))
12510 if (niters_skip)
12512 /* Add the maximum number of skipped iterations to the
12513 maximum iteration count. */
12514 if (TREE_CODE (niters_skip) == INTEGER_CST)
12515 iv_limit += wi::to_widest (niters_skip);
12516 else
12517 iv_limit += max_vf - 1;
12519 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12520 /* Make a conservatively-correct assumption. */
12521 iv_limit += max_vf - 1;
12523 /* IV_LIMIT is the maximum number of latch iterations, which is also
12524 the maximum in-range IV value. Round this value down to the previous
12525 vector alignment boundary and then add an extra full iteration. */
12526 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12527 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
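      /* For example (a sketch): with a constant VF of 8, no skipped or
         peeled iterations and at most 37 latch iterations, this computes
         (37 & -8) + 8 = 32 + 8 = 40, so the IV must be able to hold 40.  */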
12529 return iv_limit;
12532 /* For the given rgroup_controls RGC, check whether an induction variable
12533 would ever hit a value that produces a set of all-false masks or zero
12534 lengths before wrapping around. Return true if it's possible to wrap
12535 around before hitting the desirable value, otherwise return false. */
12537 bool
12538 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12540 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12542 if (iv_limit == -1)
12543 return true;
12545 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12546 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12547 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
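  /* For example (a sketch): with IV_LIMIT = 40, max_nscalars_per_iter = 2
     and factor = 1 we get NITEMS = 2 and IV_LIMIT * NITEMS = 80, which needs
     7 bits; any compare type with at least 7 bits of precision therefore
     cannot wrap before the all-false / zero-length state is reached.  */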
12549 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12550 return true;
12552 return false;