gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
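(For illustration: "consecutive" here means unit-stride accesses such as
a[i], a[i+1], ... across successive iterations, as opposed to, say, a
strided access like a[2*i].)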
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target-specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different vector sizes will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype has already been set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
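/* Illustrative sketch (hypothetical example): for an IV updated as
i = i + 4 in loop number LOOP_NB, scev analysis yields an access function
of the form {init, +, 4}_LOOP_NB, so this function returns *INIT == init
and *STEP == 4. A step that is itself a chrec (a polynomial of degree
>= 2) is rejected as not "simple". */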
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
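/* For illustration (hypothetical examples), the latch updates this function
classifies look like

x = -x; (neg)
x = x * 3; (mul by constant)
x = x << 1; or x = x >> 1; (shift by constant)

where x is the result of the loop-header PHI being analyzed. */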
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Return true if PHI is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
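/* For illustration (hypothetical example):

t = 0;
for (i = 0; i < n; i++) { b[i] = a[i] - t; t = a[i]; }

The value of t used in iteration i is the value defined in iteration i - 1;
vectorizing this requires a permute that combines the previous and current
vectors of t values (see the shuffle-vector note in the function body). */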
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs a shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle a nonlinear IV for the same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as a reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns, or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place the result
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
945 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 unshare_expr (niter),
947 build_int_cst (TREE_TYPE (niter), 1));
948 if (TREE_CODE (niter) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, make sure that
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
954 PR113210. */
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 build_minus_one_cst (TREE_TYPE (niter)));
960 *number_of_iterations = niter;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
966 return conds;
969 /* Determine the main loop exit for the vectorizer. */
971 edge
972 vec_init_loop_exit_info (class loop *loop)
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxiliary exits. */
976 auto_vec<edge> exits = get_loop_exit_edges (loop);
977 if (exits.length () == 1)
978 return exits[0];
980 /* If we have multiple exits we only support counting IV at the moment. Analyze
981 all exits and return one. */
982 class tree_niter_desc niter_desc;
983 edge candidate = NULL;
984 for (edge exit : exits)
986 if (!get_loop_exit_condition (exit))
987 continue;
989 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 && !chrec_contains_undetermined (niter_desc.niter))
992 tree may_be_zero = niter_desc.may_be_zero;
993 if (integer_zerop (may_be_zero)
994 && (!candidate
995 || dominated_by_p (CDI_DOMINATORS, exit->src,
996 candidate->src)))
997 candidate = exit;
1001 return candidate;
1004 /* Function bb_in_loop_p
1006 Used as predicate for dfs order traversal of the loop bbs. */
1008 static bool
1009 bb_in_loop_p (const_basic_block bb, const void *data)
1011 const class loop *const loop = (const class loop *)data;
1012 if (flow_bb_inside_loop_p (loop, bb))
1013 return true;
1014 return false;
1018 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1019 stmt_vec_info structs for all the stmts in LOOP_IN. */
1021 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1022 : vec_info (vec_info::loop, shared),
1023 loop (loop_in),
1024 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1025 num_itersm1 (NULL_TREE),
1026 num_iters (NULL_TREE),
1027 num_iters_unchanged (NULL_TREE),
1028 num_iters_assumptions (NULL_TREE),
1029 vector_costs (nullptr),
1030 scalar_costs (nullptr),
1031 th (0),
1032 versioning_threshold (0),
1033 vectorization_factor (0),
1034 main_loop_edge (nullptr),
1035 skip_main_loop_edge (nullptr),
1036 skip_this_loop_edge (nullptr),
1037 reusable_accumulators (),
1038 suggested_unroll_factor (1),
1039 max_vectorization_factor (0),
1040 mask_skip_niters (NULL_TREE),
1041 rgroup_compare_type (NULL_TREE),
1042 simd_if_cond (NULL_TREE),
1043 partial_vector_style (vect_partial_vectors_none),
1044 unaligned_dr (NULL),
1045 peeling_for_alignment (0),
1046 ptr_mask (0),
1047 ivexpr_map (NULL),
1048 scan_map (NULL),
1049 slp_unrolling_factor (1),
1050 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1051 vectorizable (false),
1052 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1053 using_partial_vectors_p (false),
1054 using_decrementing_iv_p (false),
1055 using_select_vl_p (false),
1056 epil_using_partial_vectors_p (false),
1057 partial_load_store_bias (0),
1058 peeling_for_gaps (false),
1059 peeling_for_niter (false),
1060 early_breaks (false),
1061 no_data_dependencies (false),
1062 has_mask_store (false),
1063 scalar_loop_scaling (profile_probability::uninitialized ()),
1064 scalar_loop (NULL),
1065 orig_loop_info (NULL),
1066 vec_loop_iv_exit (NULL),
1067 vec_epilogue_loop_iv_exit (NULL),
1068 scalar_loop_iv_exit (NULL)
1070 /* CHECKME: We want to visit all BBs before their successors (except for
1071 latch blocks, for which this assertion wouldn't hold). In the simple
1072 case of the loop forms we allow, a dfs order of the BBs would be the same
1073 as reversed postorder traversal, so we are safe. */
1075 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1076 bbs, loop->num_nodes, loop);
1077 gcc_assert (nbbs == loop->num_nodes);
1079 for (unsigned int i = 0; i < nbbs; i++)
1081 basic_block bb = bbs[i];
1082 gimple_stmt_iterator si;
1084 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1086 gimple *phi = gsi_stmt (si);
1087 gimple_set_uid (phi, 0);
1088 add_stmt (phi);
1091 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1093 gimple *stmt = gsi_stmt (si);
1094 gimple_set_uid (stmt, 0);
1095 if (is_gimple_debug (stmt))
1096 continue;
1097 add_stmt (stmt);
1098 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1099 third argument is the #pragma omp simd if (x) condition. When it is 0,
1100 the loop shouldn't be vectorized; when it is a non-zero constant, it
1101 should be vectorized normally; otherwise the loop is versioned, with the
1102 vectorized copy used if the condition is non-zero at runtime. */
1103 if (loop_in->simduid
1104 && is_gimple_call (stmt)
1105 && gimple_call_internal_p (stmt)
1106 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1107 && gimple_call_num_args (stmt) >= 3
1108 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1109 && (loop_in->simduid
1110 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1112 tree arg = gimple_call_arg (stmt, 2);
1113 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1114 simd_if_cond = arg;
1115 else
1116 gcc_assert (integer_nonzerop (arg));
1121 epilogue_vinfos.create (6);
1124 /* Free all levels of rgroup CONTROLS. */
1126 void
1127 release_vec_loop_controls (vec<rgroup_controls> *controls)
1129 rgroup_controls *rgc;
1130 unsigned int i;
1131 FOR_EACH_VEC_ELT (*controls, i, rgc)
1132 rgc->controls.release ();
1133 controls->release ();
1136 /* Free all memory used by the _loop_vec_info, as well as all the
1137 stmt_vec_info structs of all the stmts in the loop. */
1139 _loop_vec_info::~_loop_vec_info ()
1141 free (bbs);
1143 release_vec_loop_controls (&masks.rgc_vec);
1144 release_vec_loop_controls (&lens);
1145 delete ivexpr_map;
1146 delete scan_map;
1147 epilogue_vinfos.release ();
1148 delete scalar_costs;
1149 delete vector_costs;
1151 /* When we release an epilogue vinfo that we do not intend to use,
1152 avoid clearing AUX of the main loop which should continue to
1153 point to the main loop vinfo since otherwise we'll leak that. */
1154 if (loop->aux == this)
1155 loop->aux = NULL;
1158 /* Return an invariant or register for EXPR and emit necessary
1159 computations in the LOOP_VINFO loop preheader. */
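/* Usage note (descriptive): if EXPR is already an SSA name or invariant it
is returned unchanged; otherwise it is gimplified, the computation is
inserted on the preheader edge, and the result is cached in ivexpr_map so
that later calls with an equal expression reuse the same SSA name. */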
1161 tree
1162 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1164 if (is_gimple_reg (expr)
1165 || is_gimple_min_invariant (expr))
1166 return expr;
1168 if (! loop_vinfo->ivexpr_map)
1169 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1170 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1171 if (! cached)
1173 gimple_seq stmts = NULL;
1174 cached = force_gimple_operand (unshare_expr (expr),
1175 &stmts, true, NULL_TREE);
1176 if (stmts)
1178 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1179 gsi_insert_seq_on_edge_immediate (e, stmts);
1182 return cached;
1185 /* Return true if we can use CMP_TYPE as the comparison type to produce
1186 all masks required to mask LOOP_VINFO. */
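/* Background note: IFN_WHILE_ULT (A, B) yields a mask whose element I is set
iff A + I < B, so this asks whether the target can compute such masks with
CMP_TYPE operands for every mask type recorded in the rgroups (on AArch64
SVE, for example, this maps to the WHILELO instruction). */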
1188 static bool
1189 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1191 rgroup_controls *rgm;
1192 unsigned int i;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 if (rgm->type != NULL_TREE
1195 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1196 cmp_type, rgm->type,
1197 OPTIMIZE_FOR_SPEED))
1198 return false;
1199 return true;
1202 /* Calculate the maximum number of scalars per iteration for every
1203 rgroup in LOOP_VINFO. */
1205 static unsigned int
1206 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1208 unsigned int res = 1;
1209 unsigned int i;
1210 rgroup_controls *rgm;
1211 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1212 res = MAX (res, rgm->max_nscalars_per_iter);
1213 return res;
1216 /* Calculate the minimum precision necessary to represent:
1218 MAX_NITERS * FACTOR
1220 as an unsigned integer, where MAX_NITERS is the maximum number of
1221 loop header iterations for the original scalar form of LOOP_VINFO. */
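/* Worked example (sketch): with at most 1000 loop header iterations and
FACTOR == 4 the product is 4000, and representing 4000 as an unsigned
value needs wi::min_precision (4000, UNSIGNED) == 12 bits. */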
1223 static unsigned
1224 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1226 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1228 /* Get the maximum number of iterations that is representable
1229 in the counter type. */
1230 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1231 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1233 /* Get a more refined estimate for the number of iterations. */
1234 widest_int max_back_edges;
1235 if (max_loop_iterations (loop, &max_back_edges))
1236 max_ni = wi::smin (max_ni, max_back_edges + 1);
1238 /* Work out how many bits we need to represent the limit. */
1239 return wi::min_precision (max_ni * factor, UNSIGNED);
1242 /* True if the loop needs peeling or partial vectors when vectorized. */
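/* For example (sketch): with a known iteration count of 100 and VF == 8,
100 is not a multiple of 8, so peeling or partial vectors is needed;
with a count of 96 (and no peeling for gaps or alignment) it is not. */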
1244 static bool
1245 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1247 unsigned HOST_WIDE_INT const_vf;
1248 HOST_WIDE_INT max_niter
1249 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1251 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1252 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1253 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1254 (loop_vinfo));
1256 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1257 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1259 /* Work out the (constant) number of iterations that need to be
1260 peeled for reasons other than niters. */
1261 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1262 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1263 peel_niter += 1;
1264 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1265 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1266 return true;
1268 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1269 /* ??? When peeling for gaps but not alignment, we could
1270 try to check whether the (variable) niters is known to be
1271 VF * N + 1. That's something of a niche case though. */
1272 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1273 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1274 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1275 < (unsigned) exact_log2 (const_vf))
1276 /* In case of versioning, check if the maximum number of
1277 iterations is greater than th. If they are identical,
1278 the epilogue is unnecessary. */
1279 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1280 || ((unsigned HOST_WIDE_INT) max_niter
1281 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1282 but that's only computed later based on our result.
1283 The following is the most conservative approximation. */
1284 > (std::max ((unsigned HOST_WIDE_INT) th,
1285 const_vf) / const_vf) * const_vf))))
1286 return true;
1288 return false;
1291 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1292 whether we can actually generate the masks required. Return true if so,
1293 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1295 static bool
1296 vect_verify_full_masking (loop_vec_info loop_vinfo)
1298 unsigned int min_ni_width;
1300 /* Use a normal loop if there are no statements that need masking.
1301 This only happens in rare degenerate cases: it means that the loop
1302 has no loads, no stores, and no live-out values. */
1303 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1304 return false;
1306 /* Produce the rgroup controls. */
1307 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1309 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1310 tree vectype = mask.first;
1311 unsigned nvectors = mask.second;
1313 if (masks->rgc_vec.length () < nvectors)
1314 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1315 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1316 /* The number of scalars per iteration and the number of vectors are
1317 both compile-time constants. */
1318 unsigned int nscalars_per_iter
1319 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1320 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
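/* E.g. (sketch): with VF == 16, a mask vectype with 8 elements and
nvectors == 2 this yields nscalars_per_iter == (2 * 8) / 16 == 1. */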
1322 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1324 rgm->max_nscalars_per_iter = nscalars_per_iter;
1325 rgm->type = truth_type_for (vectype);
1326 rgm->factor = 1;
1330 unsigned int max_nscalars_per_iter
1331 = vect_get_max_nscalars_per_iter (loop_vinfo);
1333 /* Work out how many bits we need to represent the limit. */
1334 min_ni_width
1335 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1337 /* Find a scalar mode for which WHILE_ULT is supported. */
1338 opt_scalar_int_mode cmp_mode_iter;
1339 tree cmp_type = NULL_TREE;
1340 tree iv_type = NULL_TREE;
1341 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1342 unsigned int iv_precision = UINT_MAX;
1344 if (iv_limit != -1)
1345 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1346 UNSIGNED);
1348 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1350 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1351 if (cmp_bits >= min_ni_width
1352 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1354 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1355 if (this_type
1356 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1358 /* Although we could stop as soon as we find a valid mode,
1359 there are at least two reasons why that's not always the
1360 best choice:
1362 - An IV that's Pmode or wider is more likely to be reusable
1363 in address calculations than an IV that's narrower than
1364 Pmode.
1366 - Doing the comparison in IV_PRECISION or wider allows
1367 a natural 0-based IV, whereas using a narrower comparison
1368 type requires mitigations against wrap-around.
1370 Conversely, if the IV limit is variable, doing the comparison
1371 in a wider type than the original type can introduce
1372 unnecessary extensions, so picking the widest valid mode
1373 is not always a good choice either.
1375 Here we prefer the first IV type that's Pmode or wider,
1376 and the first comparison type that's IV_PRECISION or wider.
1377 (The comparison type must be no wider than the IV type,
1378 to avoid extensions in the vector loop.)
1380 ??? We might want to try continuing beyond Pmode for ILP32
1381 targets if CMP_BITS < IV_PRECISION. */
1382 iv_type = this_type;
1383 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1384 cmp_type = this_type;
1385 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1386 break;
1391 if (!cmp_type)
1393 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1394 return false;
1397 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1398 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1399 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1400 return true;
1403 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1404 whether we can actually generate AVX512 style masks. Return true if so,
1405 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
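/* Descriptive note: in this style the masks are integer-mode vectors of
single-bit elements, produced by an LT_EXPR vector comparison derived from
the remaining-iterations IV rather than by IFN_WHILE_ULT; the checks below
verify such a comparison can be expanded for every recorded mask type. */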
1407 static bool
1408 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1410 /* Produce a differently organized rgc_vec and check in a different
1411 way whether we can produce the masks. */
1413 /* Use a normal loop if there are no statements that need masking.
1414 This only happens in rare degenerate cases: it means that the loop
1415 has no loads, no stores, and no live-out values. */
1416 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1417 return false;
1419 /* For the decrementing IV we need to represent all values in
1420 [0, niter + niter_skip] where niter_skip is the elements we
1421 skip in the first iteration for prologue peeling. */
1422 tree iv_type = NULL_TREE;
1423 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1424 unsigned int iv_precision = UINT_MAX;
1425 if (iv_limit != -1)
1426 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1428 /* First compute the type for the IV we use to track the remaining
1429 scalar iterations. */
1430 opt_scalar_int_mode cmp_mode_iter;
1431 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1433 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1434 if (cmp_bits >= iv_precision
1435 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1437 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1438 if (iv_type)
1439 break;
1442 if (!iv_type)
1443 return false;
1445 /* Produce the rgroup controls. */
1446 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1448 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1449 tree vectype = mask.first;
1450 unsigned nvectors = mask.second;
1452 /* The number of scalars per iteration and the number of vectors are
1453 both compile-time constants. */
1454 unsigned int nscalars_per_iter
1455 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1456 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1458 /* We index the rgroup_controls vector with nscalars_per_iter
1459 which we keep constant and instead have a varying nvectors,
1460 remembering the vector mask with the fewest nV. */
1461 if (masks->rgc_vec.length () < nscalars_per_iter)
1462 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1463 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1465 if (!rgm->type || rgm->factor > nvectors)
1467 rgm->type = truth_type_for (vectype);
1468 rgm->compare_type = NULL_TREE;
1469 rgm->max_nscalars_per_iter = nscalars_per_iter;
1470 rgm->factor = nvectors;
1471 rgm->bias_adjusted_ctrl = NULL_TREE;
1475 /* There is no fixed compare type we are going to use but we have to
1476 be able to get at one for each mask group. */
1477 unsigned int min_ni_width
1478 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1480 bool ok = true;
1481 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1483 tree mask_type = rgc.type;
1484 if (!mask_type)
1485 continue;
1487 /* For now vect_get_loop_mask only supports integer mode masks
1488 when we need to split it. */
1489 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1490 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1492 ok = false;
1493 break;
1496 /* If iv_type is usable as compare type use that - we can elide the
1497 saturation in that case. */
1498 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1500 tree cmp_vectype
1501 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1502 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1503 rgc.compare_type = cmp_vectype;
1505 if (!rgc.compare_type)
1506 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1508 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1509 if (cmp_bits >= min_ni_width
1510 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1512 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1513 if (!cmp_type)
1514 continue;
1516 /* Check whether we can produce the mask with cmp_type. */
1517 tree cmp_vectype
1518 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1519 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1521 rgc.compare_type = cmp_vectype;
1522 break;
1526 if (!rgc.compare_type)
1528 ok = false;
1529 break;
1532 if (!ok)
1534 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1535 return false;
1538 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1539 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1540 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1541 return true;
1544 /* Check whether we can use vector access with length based on precision
1545 comparison. So far, to keep it simple, we only allow the case that the
1546 precision of the target supported length is larger than the precision
1547 required by loop niters. */
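/* Background note (illustrative): length-based partial vectors use
IFN_LEN_LOAD / IFN_LEN_STORE, which take an explicit length for each
access instead of a per-element mask (e.g. PowerPC's lxvl/stxvl); the -1
bias handled below exists for targets whose length operand is biased by
one, such as s390's vll/vstl. */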
1549 static bool
1550 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1552 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1553 return false;
1555 machine_mode len_load_mode, len_store_mode;
1556 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1557 .exists (&len_load_mode))
1558 return false;
1559 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1560 .exists (&len_store_mode))
1561 return false;
1563 signed char partial_load_bias = internal_len_load_store_bias
1564 (IFN_LEN_LOAD, len_load_mode);
1566 signed char partial_store_bias = internal_len_load_store_bias
1567 (IFN_LEN_STORE, len_store_mode);
1569 gcc_assert (partial_load_bias == partial_store_bias);
1571 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1572 return false;
1574 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1575 len_loads with a length of zero. In order to avoid that we prohibit
1576 more than one loop length here. */
1577 if (partial_load_bias == -1
1578 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1579 return false;
1581 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1583 unsigned int max_nitems_per_iter = 1;
1584 unsigned int i;
1585 rgroup_controls *rgl;
1586 /* Find the maximum number of items per iteration for every rgroup. */
1587 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1589 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1590 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1593 /* Work out how many bits we need to represent the length limit. */
1594 unsigned int min_ni_prec
1595 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1597 /* Now use the maximum of the precisions below for one suitable IV type:
1598 - the IV's natural precision
1599 - the precision needed to hold: the maximum number of scalar
1600 iterations multiplied by the scale factor (min_ni_prec above)
1601 - the Pmode precision
1603 If min_ni_prec is less than the precision of the current niters,
1604 we prefer to still use the niters type. Prefer to use Pmode and
1605 wider IV to avoid narrow conversions. */
1607 unsigned int ni_prec
1608 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1609 min_ni_prec = MAX (min_ni_prec, ni_prec);
1610 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1612 tree iv_type = NULL_TREE;
1613 opt_scalar_int_mode tmode_iter;
1614 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1616 scalar_mode tmode = tmode_iter.require ();
1617 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1619 /* ??? Do we really want to construct one IV whose precision exceeds
1620 BITS_PER_WORD? */
1621 if (tbits > BITS_PER_WORD)
1622 break;
1624 /* Find the first available standard integral type. */
1625 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1627 iv_type = build_nonstandard_integer_type (tbits, true);
1628 break;
1632 if (!iv_type)
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "can't vectorize with length-based partial vectors"
1637 " because there is no suitable iv type.\n");
1638 return false;
1641 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1642 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1643 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1645 return true;
1648 /* Calculate the cost of one scalar iteration of the loop. */
1649 static void
1650 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1652 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1653 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1654 int nbbs = loop->num_nodes, factor;
1655 int innerloop_iters, i;
1657 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1659 /* Gather costs for statements in the scalar loop. */
1661 /* FORNOW. */
1662 innerloop_iters = 1;
1663 if (loop->inner)
1664 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1666 for (i = 0; i < nbbs; i++)
1668 gimple_stmt_iterator si;
1669 basic_block bb = bbs[i];
1671 if (bb->loop_father == loop->inner)
1672 factor = innerloop_iters;
1673 else
1674 factor = 1;
1676 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1678 gimple *stmt = gsi_stmt (si);
1679 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1681 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1682 continue;
1684 /* Skip stmts that are not vectorized inside the loop. */
1685 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1686 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1687 && (!STMT_VINFO_LIVE_P (vstmt_info)
1688 || !VECTORIZABLE_CYCLE_DEF
1689 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1690 continue;
1692 vect_cost_for_stmt kind;
1693 if (STMT_VINFO_DATA_REF (stmt_info))
1695 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1696 kind = scalar_load;
1697 else
1698 kind = scalar_store;
1700 else if (vect_nop_conversion_p (stmt_info))
1701 continue;
1702 else
1703 kind = scalar_stmt;
1705 /* We are using vect_prologue here to avoid scaling twice
1706 by the inner loop factor. */
1707 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1708 factor, kind, stmt_info, 0, vect_prologue);
1712 /* Now accumulate cost. */
1713 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1714 add_stmt_costs (loop_vinfo->scalar_costs,
1715 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1716 loop_vinfo->scalar_costs->finish_cost (nullptr);
1719 /* Function vect_analyze_loop_form.
1721 Verify that certain CFG restrictions hold, including:
1722 - the loop has a pre-header
1723 - the loop has a single entry
1724 - nested loops can have only a single exit.
1725 - the loop exit condition is simple enough
1726 - the number of iterations can be analyzed, i.e., a countable loop. The
1727 niter could be analyzed under some assumptions. */
1729 opt_result
1730 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1732 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1734 edge exit_e = vec_init_loop_exit_info (loop);
1735 if (!exit_e)
1736 return opt_result::failure_at (vect_location,
1737 "not vectorized:"
1738 " could not determine main exit from"
1739 " loop with multiple exits.\n");
1740 info->loop_exit = exit_e;
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "using as main loop exit: %d -> %d [AUX: %p]\n",
1744 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1746 /* Check if we have any control flow that doesn't leave the loop. */
1747 class loop *v_loop = loop->inner ? loop->inner : loop;
1748 basic_block *bbs = get_loop_body (v_loop);
1749 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1750 if (EDGE_COUNT (bbs[i]->succs) != 1
1751 && (EDGE_COUNT (bbs[i]->succs) != 2
1752 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1753 return opt_result::failure_at (vect_location,
1754 "not vectorized:"
1755 " unsupported control flow in loop.\n");
1757 /* Different restrictions apply when we are considering an inner-most loop,
1758 vs. an outer (nested) loop.
1759 (FORNOW. May want to relax some of these restrictions in the future). */
1761 info->inner_loop_cond = NULL;
1762 if (!loop->inner)
1764 /* Inner-most loop. We currently require that the number of BBs is
1765 exactly 2 (the header and latch). Vectorizable inner-most loops
1766 look like this:
1768 (pre-header)
1770 header <--------+
1771 | | |
1772 | +--> latch --+
1774 (exit-bb) */
1776 if (empty_block_p (loop->header))
1777 return opt_result::failure_at (vect_location,
1778 "not vectorized: empty loop.\n");
1780 else
1782 class loop *innerloop = loop->inner;
1783 edge entryedge;
1785 /* Nested loop. We currently require that the loop is doubly-nested,
1786 contains a single inner loop, and the number of BBs is exactly 5.
1787 Vectorizable outer-loops look like this:
1789 (pre-header)
1791 header <---+
1793 inner-loop |
1795 tail ------+
1797 (exit-bb)
1799 The inner-loop has the properties expected of inner-most loops
1800 as described above. */
1802 if ((loop->inner)->inner || (loop->inner)->next)
1803 return opt_result::failure_at (vect_location,
1804 "not vectorized:"
1805 " multiple nested loops.\n");
1807 entryedge = loop_preheader_edge (innerloop);
1808 if (entryedge->src != loop->header
1809 || !single_exit (innerloop)
1810 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1811 return opt_result::failure_at (vect_location,
1812 "not vectorized:"
1813 " unsupported outerloop form.\n");
1815 /* Analyze the inner-loop. */
1816 vect_loop_form_info inner;
1817 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1818 if (!res)
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1822 "not vectorized: Bad inner loop.\n");
1823 return res;
1826 /* We don't support analyzing the niter under assumptions for the
1827 inner loop. */
1828 if (!integer_onep (inner.assumptions))
1829 return opt_result::failure_at (vect_location,
1830 "not vectorized: Bad inner loop.\n");
1832 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1833 return opt_result::failure_at (vect_location,
1834 "not vectorized: inner-loop count not"
1835 " invariant.\n");
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_NOTE, vect_location,
1839 "Considering outer-loop vectorization.\n");
1840 info->inner_loop_cond = inner.conds[0];
1843 if (EDGE_COUNT (loop->header->preds) != 2)
1844 return opt_result::failure_at (vect_location,
1845 "not vectorized:"
1846 " too many incoming edges.\n");
1848 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1849 that the loop is represented as a do-while (with a proper if-guard
1850 before the loop if needed), where the loop header contains all the
1851 executable statements, and the latch is empty. */
1852 if (!empty_block_p (loop->latch)
1853 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1854 return opt_result::failure_at (vect_location,
1855 "not vectorized: latch block not empty.\n");
1857 /* Make sure the exit is not abnormal. */
1858 auto_vec<edge> exits = get_loop_exit_edges (loop);
1859 for (edge e : exits)
1861 if (e->flags & EDGE_ABNORMAL)
1862 return opt_result::failure_at (vect_location,
1863 "not vectorized:"
1864 " abnormal loop exit edge.\n");
1867 info->conds
1868 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1869 &info->number_of_iterations,
1870 &info->number_of_iterationsm1);
1872 if (info->conds.is_empty ())
1873 return opt_result::failure_at
1874 (vect_location,
1875 "not vectorized: complicated exit condition.\n");
1877 /* Determine what the primary and alternate exit conds are. */
1878 for (unsigned i = 0; i < info->conds.length (); i++)
1880 gcond *cond = info->conds[i];
1881 if (exit_e->src == gimple_bb (cond))
1882 std::swap (info->conds[0], info->conds[i]);
1885 if (integer_zerop (info->assumptions)
1886 || !info->number_of_iterations
1887 || chrec_contains_undetermined (info->number_of_iterations))
1888 return opt_result::failure_at
1889 (info->conds[0],
1890 "not vectorized: number of iterations cannot be computed.\n");
1892 if (integer_zerop (info->number_of_iterations))
1893 return opt_result::failure_at
1894 (info->conds[0],
1895 "not vectorized: number of iterations = 0.\n");
1897 if (!(tree_fits_shwi_p (info->number_of_iterations)
1898 && tree_to_shwi (info->number_of_iterations) > 0))
1900 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_NOTE, vect_location,
1903 "Symbolic number of iterations is ");
1904 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1905 dump_printf (MSG_NOTE, "\n");
1909 return opt_result::success ();
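/* As an illustrative example (hypothetical, not from the sources): a counted
   loop such as

       for (i = 0; i < n; i++)
         a[i] = 0;

   passes the checks above with a symbolic number of iterations (n), whereas
   a search loop whose exit condition depends on values loaded inside the
   loop has no computable niter and fails with "number of iterations cannot
   be computed".  */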
1912 /* Create a loop_vec_info for LOOP with SHARED and the
1913 vect_analyze_loop_form result. */
1915 loop_vec_info
1916 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1917 const vect_loop_form_info *info,
1918 loop_vec_info main_loop_info)
1920 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1921 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1922 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1923 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1924 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1925 /* Also record the assumptions for versioning. */
1926 if (!integer_onep (info->assumptions) && !main_loop_info)
1927 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1929 for (gcond *cond : info->conds)
1931 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1932 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1933 /* Mark the statement as a condition. */
1934 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1937 for (unsigned i = 1; i < info->conds.length (); i ++)
1938 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1939 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1941 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1943 /* Check to see if we're vectorizing multiple exits. */
1944 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1945 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1947 if (info->inner_loop_cond)
1949 stmt_vec_info inner_loop_cond_info
1950 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1951 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1952 /* If we have an estimate on the number of iterations of the inner
1953 loop use that to limit the scale for costing, otherwise use
1954 --param vect-inner-loop-cost-factor literally. */
1955 widest_int nit;
1956 if (estimated_stmt_executions (loop->inner, &nit))
1957 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1958 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1961 return loop_vinfo;
1966 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1967 statements, update the vectorization factor. */
1969 static void
1970 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1972 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1973 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1974 int nbbs = loop->num_nodes;
1975 poly_uint64 vectorization_factor;
1976 int i;
1978 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1980 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1981 gcc_assert (known_ne (vectorization_factor, 0U));
1983 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1984 vectorization factor of the loop is the unrolling factor required by
1985 the SLP instances. If that unrolling factor is 1, we say that we
1986 perform pure SLP on the loop - cross-iteration parallelism is not
1987 exploited. */
1988 bool only_slp_in_loop = true;
1989 for (i = 0; i < nbbs; i++)
1991 basic_block bb = bbs[i];
1992 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1993 gsi_next (&si))
1995 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1996 if (!stmt_info)
1997 continue;
1998 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1999 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2000 && !PURE_SLP_STMT (stmt_info))
2001 /* STMT needs both SLP and loop-based vectorization. */
2002 only_slp_in_loop = false;
2004 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2005 gsi_next (&si))
2007 if (is_gimple_debug (gsi_stmt (si)))
2008 continue;
2009 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2010 stmt_info = vect_stmt_to_vectorize (stmt_info);
2011 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2012 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2013 && !PURE_SLP_STMT (stmt_info))
2014 /* STMT needs both SLP and loop-based vectorization. */
2015 only_slp_in_loop = false;
2019 if (only_slp_in_loop)
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_NOTE, vect_location,
2023 "Loop contains only SLP stmts\n");
2024 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2026 else
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "Loop contains SLP and non-SLP stmts\n");
2031 /* Both the vectorization factor and unroll factor have the form
2032 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2033 so they must have a common multiple. */
2034 vectorization_factor
2035 = force_common_multiple (vectorization_factor,
2036 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
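/* For illustration (hypothetical numbers): force_common_multiple computes a
   common multiple of the two factors (the least common multiple when both
   are constant), so a non-SLP factor of 4 and an SLP unrolling factor of 8
   combine to 8, while factors of 4 and 6 would combine to 12.  */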
2039 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2040 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location,
2043 "Updating vectorization factor to ");
2044 dump_dec (MSG_NOTE, vectorization_factor);
2045 dump_printf (MSG_NOTE, ".\n");
2049 /* Return true if STMT_INFO describes a double reduction phi and if
2050 the other phi in the reduction is also relevant for vectorization.
2051 This rejects cases such as:
2053 outer1:
2054 x_1 = PHI <x_3(outer2), ...>;
2057 inner:
2058 x_2 = ...;
2061 outer2:
2062 x_3 = PHI <x_2(inner)>;
2064 if nothing in x_2 or elsewhere makes x_1 relevant. */
2066 static bool
2067 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2069 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2070 return false;
2072 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2075 /* Function vect_analyze_loop_operations.
2077 Scan the loop stmts and make sure they are all vectorizable. */
2079 static opt_result
2080 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2082 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2083 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2084 int nbbs = loop->num_nodes;
2085 int i;
2086 stmt_vec_info stmt_info;
2087 bool need_to_vectorize = false;
2088 bool ok;
2090 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2092 auto_vec<stmt_info_for_cost> cost_vec;
2094 for (i = 0; i < nbbs; i++)
2096 basic_block bb = bbs[i];
2098 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2099 gsi_next (&si))
2101 gphi *phi = si.phi ();
2102 ok = true;
2104 stmt_info = loop_vinfo->lookup_stmt (phi);
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2107 (gimple *) phi);
2108 if (virtual_operand_p (gimple_phi_result (phi)))
2109 continue;
2111 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2112 (i.e., a phi in the tail of the outer-loop). */
2113 if (! is_loop_header_bb_p (bb))
2115 /* FORNOW: we currently don't support the case that these phis
2116 are not used in the outer loop (unless it is a double reduction,
2117 i.e., this phi is vect_reduction_def), because this case
2118 requires us to actually do something here. */
2119 if (STMT_VINFO_LIVE_P (stmt_info)
2120 && !vect_active_double_reduction_p (stmt_info))
2121 return opt_result::failure_at (phi,
2122 "Unsupported loop-closed phi"
2123 " in outer-loop.\n");
2125 /* If PHI is used in the outer loop, we check that its operand
2126 is defined in the inner loop. */
2127 if (STMT_VINFO_RELEVANT_P (stmt_info))
2129 tree phi_op;
2131 if (gimple_phi_num_args (phi) != 1)
2132 return opt_result::failure_at (phi, "unsupported phi");
2134 phi_op = PHI_ARG_DEF (phi, 0);
2135 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2136 if (!op_def_info)
2137 return opt_result::failure_at (phi, "unsupported phi\n");
2139 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2140 && (STMT_VINFO_RELEVANT (op_def_info)
2141 != vect_used_in_outer_by_reduction))
2142 return opt_result::failure_at (phi, "unsupported phi\n");
2144 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2145 || (STMT_VINFO_DEF_TYPE (stmt_info)
2146 == vect_double_reduction_def))
2147 && !vectorizable_lc_phi (loop_vinfo,
2148 stmt_info, NULL, NULL))
2149 return opt_result::failure_at (phi, "unsupported phi\n");
2152 continue;
2155 gcc_assert (stmt_info);
2157 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2158 || STMT_VINFO_LIVE_P (stmt_info))
2159 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2160 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2161 /* A scalar-dependence cycle that we don't support. */
2162 return opt_result::failure_at (phi,
2163 "not vectorized:"
2164 " scalar dependence cycle.\n");
2166 if (STMT_VINFO_RELEVANT_P (stmt_info))
2168 need_to_vectorize = true;
2169 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2170 && ! PURE_SLP_STMT (stmt_info))
2171 ok = vectorizable_induction (loop_vinfo,
2172 stmt_info, NULL, NULL,
2173 &cost_vec);
2174 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2175 || (STMT_VINFO_DEF_TYPE (stmt_info)
2176 == vect_double_reduction_def)
2177 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2178 && ! PURE_SLP_STMT (stmt_info))
2179 ok = vectorizable_reduction (loop_vinfo,
2180 stmt_info, NULL, NULL, &cost_vec);
2181 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2182 == vect_first_order_recurrence)
2183 && ! PURE_SLP_STMT (stmt_info))
2184 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2185 &cost_vec);
2188 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2189 if (ok
2190 && STMT_VINFO_LIVE_P (stmt_info)
2191 && !PURE_SLP_STMT (stmt_info))
2192 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2193 -1, false, &cost_vec);
2195 if (!ok)
2196 return opt_result::failure_at (phi,
2197 "not vectorized: relevant phi not "
2198 "supported: %G",
2199 static_cast <gimple *> (phi));
2202 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2203 gsi_next (&si))
2205 gimple *stmt = gsi_stmt (si);
2206 if (!gimple_clobber_p (stmt)
2207 && !is_gimple_debug (stmt))
2209 opt_result res
2210 = vect_analyze_stmt (loop_vinfo,
2211 loop_vinfo->lookup_stmt (stmt),
2212 &need_to_vectorize,
2213 NULL, NULL, &cost_vec);
2214 if (!res)
2215 return res;
2218 } /* bbs */
2220 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2222 /* All operations in the loop are either irrelevant (deal with loop
2223 control, or dead), or only used outside the loop and can be moved
2224 out of the loop (e.g. invariants, inductions). The loop can be
2225 optimized away by scalar optimizations. We're better off not
2226 touching this loop. */
2227 if (!need_to_vectorize)
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_NOTE, vect_location,
2231 "All the computation can be taken out of the loop.\n");
2232 return opt_result::failure_at
2233 (vect_location,
2234 "not vectorized: redundant loop. no profit to vectorize.\n");
2237 return opt_result::success ();
2240 /* Return true if we know that the iteration count is smaller than the
2241 vectorization factor. Return false if it isn't, or if we can't be sure
2242 either way. */
2244 static bool
2245 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2247 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2249 HOST_WIDE_INT max_niter;
2250 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2251 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2252 else
2253 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2255 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2256 return true;
2258 return false;
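/* For illustration (hypothetical numbers): with an assumed vectorization
   factor of 8, a loop whose iteration count is known (or bounded) to be at
   most 5 returns true here; a loop with an unknown or larger bound returns
   false.  */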
2261 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2262 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2263 definitely no, or -1 if it's worth retrying. */
2265 static int
2266 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2267 unsigned *suggested_unroll_factor)
2269 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2270 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2272 /* Only loops that can handle partially-populated vectors can have iteration
2273 counts less than the vectorization factor. */
2274 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2275 && vect_known_niters_smaller_than_vf (loop_vinfo))
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279 "not vectorized: iteration count smaller than "
2280 "vectorization factor.\n");
2281 return 0;
2284 /* If we know the number of iterations we can do better: for the
2285 epilogue we can also decide whether the main loop leaves us
2286 with enough iterations, preferring a smaller vector epilogue that is
2287 then also possibly used for the case in which we skip the vector loop. */
2288 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2290 widest_int scalar_niters
2291 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2292 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2294 loop_vec_info orig_loop_vinfo
2295 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2296 unsigned lowest_vf
2297 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2298 int prolog_peeling = 0;
2299 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2300 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2301 if (prolog_peeling >= 0
2302 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2303 lowest_vf))
2305 unsigned gap
2306 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2307 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2308 % lowest_vf + gap);
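/* For illustration (hypothetical numbers): with 100 scalar iterations, a
   main-loop VF of 8, 2 iterations peeled for alignment and 1 extra
   iteration peeled for gaps, the epilogue is left with
   (100 - 1 - 2) % 8 + 1 = 2 scalar iterations.  */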
2311 /* Reject vectorizing for a single scalar iteration, even if
2312 we could in principle implement that using partial vectors. */
2313 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2314 if (scalar_niters <= peeling_gap + 1)
2316 if (dump_enabled_p ())
2317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2318 "not vectorized: loop only has a single "
2319 "scalar iteration.\n");
2320 return 0;
2323 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2325 /* Check that the loop processes at least one full vector. */
2326 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2327 if (known_lt (scalar_niters, vf))
2329 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2331 "loop does not have enough iterations "
2332 "to support vectorization.\n");
2333 return 0;
2336 /* If we need to peel an extra epilogue iteration to handle data
2337 accesses with gaps, check that there are enough scalar iterations
2338 available.
2340 The check above is redundant with this one when peeling for gaps,
2341 but the distinction is useful for diagnostics. */
2342 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2343 && known_le (scalar_niters, vf))
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2347 "loop does not have enough iterations "
2348 "to support peeling for gaps.\n");
2349 return 0;
2354 /* If using the "very cheap" model, reject cases in which we'd keep
2355 a copy of the scalar code (even if we might be able to vectorize it). */
2356 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2357 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2358 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2359 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363 "some scalar iterations would need to be peeled\n");
2364 return 0;
2367 int min_profitable_iters, min_profitable_estimate;
2368 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2369 &min_profitable_estimate,
2370 suggested_unroll_factor);
2372 if (min_profitable_iters < 0)
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376 "not vectorized: vectorization not profitable.\n");
2377 if (dump_enabled_p ())
2378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379 "not vectorized: vector version will never be "
2380 "profitable.\n");
2381 return -1;
2384 int min_scalar_loop_bound = (param_min_vect_loop_bound
2385 * assumed_vf);
2387 /* Use the cost model only if it is more conservative than user specified
2388 threshold. */
2389 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2390 min_profitable_iters);
2392 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
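/* For illustration (hypothetical numbers): with --param min-vect-loop-bound=2
   and an assumed VF of 4, min_scalar_loop_bound is 8; if the cost model
   reports 11 as the minimum profitable iteration count, the threshold
   becomes MAX (8, 11) = 11.  */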
2394 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2395 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2397 if (dump_enabled_p ())
2398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2399 "not vectorized: vectorization not profitable.\n");
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "not vectorized: iteration count smaller than user "
2403 "specified loop bound parameter or minimum profitable "
2404 "iterations (whichever is more conservative).\n");
2405 return 0;
2408 /* The static profitability threshold min_profitable_estimate includes
2409 the cost of having to check at runtime whether the scalar loop
2410 should be used instead. If it turns out that we don't need or want
2411 such a check, the threshold we should use for the static estimate
2412 is simply the point at which the vector loop becomes more profitable
2413 than the scalar loop. */
2414 if (min_profitable_estimate > min_profitable_iters
2415 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2416 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2417 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2418 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2420 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2422 " choice between the scalar and vector loops\n");
2423 min_profitable_estimate = min_profitable_iters;
2426 /* If the vector loop needs multiple iterations to be beneficial then
2427 things are probably too close to call, and the conservative thing
2428 would be to stick with the scalar code. */
2429 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2430 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2434 "one iteration of the vector loop would be"
2435 " more expensive than the equivalent number of"
2436 " iterations of the scalar loop\n");
2437 return 0;
2440 HOST_WIDE_INT estimated_niter;
2442 /* If we are vectorizing an epilogue then we know the maximum number of
2443 scalar iterations it will cover is at least one lower than the
2444 vectorization factor of the main loop. */
2445 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2446 estimated_niter
2447 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2448 else
2450 estimated_niter = estimated_stmt_executions_int (loop);
2451 if (estimated_niter == -1)
2452 estimated_niter = likely_max_stmt_executions_int (loop);
2454 if (estimated_niter != -1
2455 && ((unsigned HOST_WIDE_INT) estimated_niter
2456 < MAX (th, (unsigned) min_profitable_estimate)))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2460 "not vectorized: estimated iteration count too "
2461 "small.\n");
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_NOTE, vect_location,
2464 "not vectorized: estimated iteration count smaller "
2465 "than specified loop bound parameter or minimum "
2466 "profitable iterations (whichever is more "
2467 "conservative).\n");
2468 return -1;
2471 return 1;
2474 static opt_result
2475 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2476 vec<data_reference_p> *datarefs,
2477 unsigned int *n_stmts)
2479 *n_stmts = 0;
2480 for (unsigned i = 0; i < loop->num_nodes; i++)
2481 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2482 !gsi_end_p (gsi); gsi_next (&gsi))
2484 gimple *stmt = gsi_stmt (gsi);
2485 if (is_gimple_debug (stmt))
2486 continue;
2487 ++(*n_stmts);
2488 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2489 NULL, 0);
2490 if (!res)
2492 if (is_gimple_call (stmt) && loop->safelen)
2494 tree fndecl = gimple_call_fndecl (stmt), op;
2495 if (fndecl == NULL_TREE
2496 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2498 fndecl = gimple_call_arg (stmt, 0);
2499 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2500 fndecl = TREE_OPERAND (fndecl, 0);
2501 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2503 if (fndecl != NULL_TREE)
2505 cgraph_node *node = cgraph_node::get (fndecl);
2506 if (node != NULL && node->simd_clones != NULL)
2508 unsigned int j, n = gimple_call_num_args (stmt);
2509 for (j = 0; j < n; j++)
2511 op = gimple_call_arg (stmt, j);
2512 if (DECL_P (op)
2513 || (REFERENCE_CLASS_P (op)
2514 && get_base_address (op)))
2515 break;
2517 op = gimple_call_lhs (stmt);
2518 /* Ignore #pragma omp declare simd functions
2519 if they don't have data references in the
2520 call stmt itself. */
2521 if (j == n
2522 && !(op
2523 && (DECL_P (op)
2524 || (REFERENCE_CLASS_P (op)
2525 && get_base_address (op)))))
2526 continue;
2530 return res;
2532 /* If dependence analysis will give up due to the limit on the
2533 number of datarefs, stop here and fail fatally. */
2534 if (datarefs->length ()
2535 > (unsigned)param_loop_max_datarefs_for_datadeps)
2536 return opt_result::failure_at (stmt, "exceeded param "
2537 "loop-max-datarefs-for-datadeps\n");
2539 return opt_result::success ();
2542 /* Look for SLP-only access groups and turn each individual access into its own
2543 group. */
2544 static void
2545 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2547 unsigned int i;
2548 struct data_reference *dr;
2550 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2552 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2553 FOR_EACH_VEC_ELT (datarefs, i, dr)
2555 gcc_assert (DR_REF (dr));
2556 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2558 /* Check if the load is a part of an interleaving chain. */
2559 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2561 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2562 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2563 unsigned int group_size = DR_GROUP_SIZE (first_element);
2565 /* Check for SLP-only groups. */
2566 if (!STMT_SLP_TYPE (stmt_info)
2567 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2569 /* Dissolve the group. */
2570 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2572 stmt_vec_info vinfo = first_element;
2573 while (vinfo)
2575 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2576 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2577 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2578 DR_GROUP_SIZE (vinfo) = 1;
2579 if (STMT_VINFO_STRIDED_P (first_element)
2580 /* We cannot handle stores with gaps. */
2581 || DR_IS_WRITE (dr_info->dr))
2583 STMT_VINFO_STRIDED_P (vinfo) = true;
2584 DR_GROUP_GAP (vinfo) = 0;
2586 else
2587 DR_GROUP_GAP (vinfo) = group_size - 1;
2588 /* Duplicate and adjust the alignment info; it needs to
2589 be present on each group leader, see dr_misalignment. */
2590 if (vinfo != first_element)
2592 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2593 dr_info2->target_alignment = dr_info->target_alignment;
2594 int misalignment = dr_info->misalignment;
2595 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2597 HOST_WIDE_INT diff
2598 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2599 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2600 unsigned HOST_WIDE_INT align_c
2601 = dr_info->target_alignment.to_constant ();
2602 misalignment = (misalignment + diff) % align_c;
2604 dr_info2->misalignment = misalignment;
2606 vinfo = next;
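/* For illustration (hypothetical numbers): if the group leader has a target
   alignment of 16 bytes and a known misalignment of 0, a dissolved member
   whose DR_INIT is 4 bytes further along inherits misalignment
   (0 + 4) % 16 = 4.  */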
2613 /* Determine if operating on full vectors for LOOP_VINFO might leave
2614 some scalar iterations still to do. If so, decide how we should
2615 handle those scalar iterations. The possibilities are:
2617 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2618 In this case:
2620 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2621 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2622 LOOP_VINFO_PEELING_FOR_NITER == false
2624 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2625 to handle the remaining scalar iterations. In this case:
2627 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2628 LOOP_VINFO_PEELING_FOR_NITER == true
2630 There are two choices:
2632 (2a) Consider vectorizing the epilogue loop at the same VF as the
2633 main loop, but using partial vectors instead of full vectors.
2634 In this case:
2636 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2638 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2639 In this case:
2641 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
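/* An illustrative example of the possibilities above (hypothetical numbers):
   a loop with 1000 scalar iterations and a VF of 16 leaves 1000 % 16 = 8
   iterations over.  Under (1) the final vector iteration is masked (or
   length-limited) to cover those 8 iterations; under (2) the main loop runs
   62 full-vector iterations and an epilogue loop handles the remaining 8,
   with (2a) and (2b) describing how that epilogue may itself be
   vectorized.  */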
2644 opt_result
2645 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2647 /* Determine whether there would be any scalar iterations left over. */
2648 bool need_peeling_or_partial_vectors_p
2649 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2651 /* Decide whether to vectorize the loop with partial vectors. */
2652 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2654 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2655 && need_peeling_or_partial_vectors_p)
2657 /* For partial-vector-usage=1, try to push the handling of partial
2658 vectors to the epilogue, with the main loop continuing to operate
2659 on full vectors.
2661 If we are unrolling we also do not want to use partial vectors. This
2662 is to avoid the overhead of generating multiple masks and also to
2663 avoid having to execute entire iterations of FALSE masked instructions
2664 when dealing with one or fewer full iterations.
2666 ??? We could then end up failing to use partial vectors if we
2667 decide to peel iterations into a prologue, and if the main loop
2668 then ends up processing fewer than VF iterations. */
2669 if ((param_vect_partial_vector_usage == 1
2670 || loop_vinfo->suggested_unroll_factor > 1)
2671 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2672 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2673 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2674 else
2675 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2678 if (dump_enabled_p ())
2679 dump_printf_loc (MSG_NOTE, vect_location,
2680 "operating on %s vectors%s.\n",
2681 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2682 ? "partial" : "full",
2683 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2684 ? " for epilogue loop" : "");
2686 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2687 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2688 && need_peeling_or_partial_vectors_p);
2690 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2691 analysis, when we don't yet know whether the loop will be vectorized
2692 with partial vectors (see tree-vect-loop-manip.cc for more details).
2694 However, the SELECT_VL vectorization style should only be applied to
2695 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2696 the number of elements to be processed in each iteration.
2698 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2699 if the loop is not vectorized with partial vectors. */
2700 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2701 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2703 return opt_result::success ();
2706 /* Function vect_analyze_loop_2.
2708 Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2709 analyses will record information in some members of LOOP_VINFO. FATAL
2710 indicates whether some analysis hits a fatal error. If the non-NULL pointer
2711 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2712 suggested unroll factor that is worked out, while a NULL pointer indicates
2713 that we are applying the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2714 holds the SLP decision made when the suggested unroll factor was worked
2715 out. */
2716 static opt_result
2717 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2718 unsigned *suggested_unroll_factor,
2719 bool& slp_done_for_suggested_uf)
2721 opt_result ok = opt_result::success ();
2722 int res;
2723 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2724 poly_uint64 min_vf = 2;
2725 loop_vec_info orig_loop_vinfo = NULL;
2727 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2728 loop_vec_info of the first vectorized loop. */
2729 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2730 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2731 else
2732 orig_loop_vinfo = loop_vinfo;
2733 gcc_assert (orig_loop_vinfo);
2735 /* The first group of checks is independent of the vector size. */
2736 fatal = true;
2738 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2739 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2740 return opt_result::failure_at (vect_location,
2741 "not vectorized: simd if(0)\n");
2743 /* Find all data references in the loop (which correspond to vdefs/vuses)
2744 and analyze their evolution in the loop. */
2746 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2748 /* Gather the data references and count stmts in the loop. */
2749 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2751 opt_result res
2752 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2753 &LOOP_VINFO_DATAREFS (loop_vinfo),
2754 &LOOP_VINFO_N_STMTS (loop_vinfo));
2755 if (!res)
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2759 "not vectorized: loop contains function "
2760 "calls or data references that cannot "
2761 "be analyzed\n");
2762 return res;
2764 loop_vinfo->shared->save_datarefs ();
2766 else
2767 loop_vinfo->shared->check_datarefs ();
2769 /* Analyze the data references and also adjust the minimal
2770 vectorization factor according to the loads and stores. */
2772 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2773 if (!ok)
2775 if (dump_enabled_p ())
2776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2777 "bad data references.\n");
2778 return ok;
2781 /* Check if we are applying unroll factor now. */
2782 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2783 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2785 /* If the SLP decision was false when the suggested unroll factor was
2786 worked out, and we are now applying that suggested unroll factor, we
2787 can simply skip all SLP-related analyses this time. */
2788 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2790 /* Classify all cross-iteration scalar data-flow cycles.
2791 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2792 vect_analyze_scalar_cycles (loop_vinfo, slp);
2794 vect_pattern_recog (loop_vinfo);
2796 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2798 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2799 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2801 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2802 if (!ok)
2804 if (dump_enabled_p ())
2805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2806 "bad data access.\n");
2807 return ok;
2810 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2812 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "unexpected pattern.\n");
2818 return ok;
2821 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not fatal. */
2822 fatal = false;
2824 /* Analyze data dependences between the data-refs in the loop
2825 and adjust the maximum vectorization factor according to
2826 the dependences.
2827 FORNOW: fail at the first data dependence that we encounter. */
2829 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2830 if (!ok)
2832 if (dump_enabled_p ())
2833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2834 "bad data dependence.\n");
2835 return ok;
2837 if (max_vf != MAX_VECTORIZATION_FACTOR
2838 && maybe_lt (max_vf, min_vf))
2839 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2840 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2842 ok = vect_determine_vectorization_factor (loop_vinfo);
2843 if (!ok)
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2847 "can't determine vectorization factor.\n");
2848 return ok;
2851 /* Compute the scalar iteration cost. */
2852 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2854 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2856 if (slp)
2858 /* Check the SLP opportunities in the loop, analyze and build
2859 SLP trees. */
2860 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2861 if (!ok)
2862 return ok;
2864 /* If there are any SLP instances mark them as pure_slp. */
2865 slp = vect_make_slp_decision (loop_vinfo);
2866 if (slp)
2868 /* Find stmts that need to be both vectorized and SLPed. */
2869 vect_detect_hybrid_slp (loop_vinfo);
2871 /* Update the vectorization factor based on the SLP decision. */
2872 vect_update_vf_for_slp (loop_vinfo);
2874 /* Optimize the SLP graph with the vectorization factor fixed. */
2875 vect_optimize_slp (loop_vinfo);
2877 /* Gather the loads reachable from the SLP graph entries. */
2878 vect_gather_slp_loads (loop_vinfo);
2882 bool saved_can_use_partial_vectors_p
2883 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2885 /* We don't expect to have to roll back to anything other than an empty
2886 set of rgroups. */
2887 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2889 /* This is the point where we can re-start analysis with SLP forced off. */
2890 start_over:
2892 /* Apply the suggested unrolling factor; this was determined by the backend
2893 during finish_cost the first time we ran the analysis for this
2894 vector mode. */
2895 if (applying_suggested_uf)
2896 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2898 /* Now the vectorization factor is final. */
2899 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2900 gcc_assert (known_ne (vectorization_factor, 0U));
2902 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2904 dump_printf_loc (MSG_NOTE, vect_location,
2905 "vectorization_factor = ");
2906 dump_dec (MSG_NOTE, vectorization_factor);
2907 dump_printf (MSG_NOTE, ", niters = %wd\n",
2908 LOOP_VINFO_INT_NITERS (loop_vinfo));
2911 if (max_vf != MAX_VECTORIZATION_FACTOR
2912 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2913 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2915 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2917 /* Analyze the alignment of the data-refs in the loop.
2918 Fail if a data reference is found that cannot be vectorized. */
2920 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2921 if (!ok)
2923 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2925 "bad data alignment.\n");
2926 return ok;
2929 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2930 It is important to call pruning after vect_analyze_data_ref_accesses,
2931 since we use grouping information gathered by interleaving analysis. */
2932 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2933 if (!ok)
2934 return ok;
2936 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2937 vectorization, since we do not want to add extra peeling or
2938 add versioning for alignment. */
2939 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2940 /* This pass will decide on using loop versioning and/or loop peeling in
2941 order to enhance the alignment of data references in the loop. */
2942 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2943 if (!ok)
2944 return ok;
2946 if (slp)
2948 /* Analyze operations in the SLP instances. Note this may
2949 remove unsupported SLP instances which makes the above
2950 SLP kind detection invalid. */
2951 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2952 vect_slp_analyze_operations (loop_vinfo);
2953 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2955 ok = opt_result::failure_at (vect_location,
2956 "unsupported SLP instances\n");
2957 goto again;
2960 /* Check whether any load in ALL SLP instances is possibly permuted. */
2961 slp_tree load_node, slp_root;
2962 unsigned i, x;
2963 slp_instance instance;
2964 bool can_use_lanes = true;
2965 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2967 slp_root = SLP_INSTANCE_TREE (instance);
2968 int group_size = SLP_TREE_LANES (slp_root);
2969 tree vectype = SLP_TREE_VECTYPE (slp_root);
2970 bool loads_permuted = false;
2971 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2973 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2974 continue;
2975 unsigned j;
2976 stmt_vec_info load_info;
2977 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2978 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2980 loads_permuted = true;
2981 break;
2985 /* If the loads and stores can be handled with load/store-lane
2986 instructions record it and move on to the next instance. */
2987 if (loads_permuted
2988 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2989 && vect_store_lanes_supported (vectype, group_size, false)
2990 != IFN_LAST)
2992 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2993 if (STMT_VINFO_GROUPED_ACCESS
2994 (SLP_TREE_REPRESENTATIVE (load_node)))
2996 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2997 (SLP_TREE_REPRESENTATIVE (load_node));
2998 /* Use SLP for strided accesses (or if we can't
2999 load-lanes). */
3000 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3001 || vect_load_lanes_supported
3002 (STMT_VINFO_VECTYPE (stmt_vinfo),
3003 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3004 break;
3007 can_use_lanes
3008 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3010 if (can_use_lanes && dump_enabled_p ())
3011 dump_printf_loc (MSG_NOTE, vect_location,
3012 "SLP instance %p can use load/store-lanes\n",
3013 (void *) instance);
3015 else
3017 can_use_lanes = false;
3018 break;
3022 /* If all SLP instances can use load/store-lanes abort SLP and try again
3023 with SLP disabled. */
3024 if (can_use_lanes)
3026 ok = opt_result::failure_at (vect_location,
3027 "Built SLP cancelled: can use "
3028 "load/store-lanes\n");
3029 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3031 "Built SLP cancelled: all SLP instances support "
3032 "load/store-lanes\n");
3033 goto again;
3037 /* Dissolve SLP-only groups. */
3038 vect_dissolve_slp_only_groups (loop_vinfo);
3040 /* Scan all the remaining operations in the loop that are not subject
3041 to SLP and make sure they are vectorizable. */
3042 ok = vect_analyze_loop_operations (loop_vinfo);
3043 if (!ok)
3045 if (dump_enabled_p ())
3046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3047 "bad operation or unsupported loop bound.\n");
3048 return ok;
3051 /* For now, we don't expect to mix both the masking and length approaches for
3052 one loop, so disable the use of partial vectors if both are recorded. */
3053 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3054 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3055 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3057 if (dump_enabled_p ())
3058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3059 "can't vectorize a loop with partial vectors"
3060 " because we don't expect to mix different"
3061 " approaches with partial vectors for the"
3062 " same loop.\n");
3063 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3066 /* If we still have the option of using partial vectors,
3067 check whether we can generate the necessary loop controls. */
3068 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3070 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3072 if (!vect_verify_full_masking (loop_vinfo)
3073 && !vect_verify_full_masking_avx512 (loop_vinfo))
3074 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3076 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3077 if (!vect_verify_loop_lens (loop_vinfo))
3078 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3081 /* If we're vectorizing a loop that uses length "controls" and
3082 can iterate more than once, we apply the decrementing IV approach
3083 to the loop control. */
3084 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3085 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3086 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3087 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3088 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3089 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3090 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3092 /* If a loop uses length controls and has a decrementing loop control IV,
3093 we will normally pass that IV through a MIN_EXPR to calculate the
3094 basis for the length controls. E.g. in a loop that processes one
3095 element per scalar iteration, the number of elements would be
3096 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3098 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3099 step, since only the final iteration of the vector loop can have
3100 inactive lanes.
3102 However, some targets have a dedicated instruction for calculating the
3103 preferred length, given the total number of elements that still need to
3104 be processed. This is encapsulated in the SELECT_VL internal function.
3106 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3107 to determine the basis for the length controls. However, unlike the
3108 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3109 lanes inactive in any iteration of the vector loop, not just the last
3110 iteration. This SELECT_VL approach therefore requires us to use pointer
3111 IVs with variable steps.
3113 Once we've decided how many elements should be processed by one
3114 iteration of the vector loop, we need to populate the rgroup controls.
3115 If a loop has multiple rgroups, we need to make sure that those rgroups
3116 "line up" (that is, they must be consistent about which elements are
3117 active and which aren't). This is done by vect_adjust_loop_lens_control.
3119 In principle, it would be possible to use vect_adjust_loop_lens_control
3120 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3121 However:
3123 (1) In practice, it only makes sense to use SELECT_VL when a vector
3124 operation will be controlled directly by the result. It is not
3125 worth using SELECT_VL if it would only be the input to other
3126 calculations.
3128 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3129 pointer IV will need N updates by a variable amount (N-1 updates
3130 within the iteration and 1 update to move to the next iteration).
3132 Because of this, we prefer to use the MIN_EXPR approach whenever there
3133 is more than one length control.
3135 In addition, SELECT_VL always operates to a granularity of 1 unit.
3136 If we wanted to use it to control an SLP operation on N consecutive
3137 elements, we would need to make the SELECT_VL inputs measure scalar
3138 iterations (rather than elements) and then multiply the SELECT_VL
3139 result by N. But using SELECT_VL this way is inefficient because
3140 of (1) above.
3142 In addition, we don't apply SELECT_VL to a single rgroup when both (1)
3143 and (2) are satisfied:
3145 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3146 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3148 Since SELECT_VL (with its variable step) would make SCEV analysis fail and
3149 we would then lose the benefits of the following unroll optimizations, we
3150 prefer using the MIN_EXPR approach in this situation. */
3151 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3153 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3154 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3155 OPTIMIZE_FOR_SPEED)
3156 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3157 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3158 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3159 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3160 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
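/* A minimal sketch of the two styles described above (illustrative GIMPLE,
   not taken from a real dump), for a single length control and
   vectorization factor VF:

       MIN_EXPR approach:   len = MIN_EXPR <left, VF>;
       SELECT_VL approach:  len = .SELECT_VL (left, VF);

   followed in both cases by  left = left - len.  With MIN_EXPR only the
   final iteration can have inactive lanes, so pointer IVs can use an
   invariant step; with SELECT_VL any iteration may process fewer than VF
   elements, so pointer IVs must advance by the variable LEN.  */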
3163 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3164 assuming that the loop will be used as a main loop. We will redo
3165 this analysis later if we instead decide to use the loop as an
3166 epilogue loop. */
3167 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3168 if (!ok)
3169 return ok;
3171 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3172 to be able to handle fewer than VF scalars, or needs to have a lower VF
3173 than the main loop. */
3174 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3175 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3177 poly_uint64 unscaled_vf
3178 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3179 orig_loop_vinfo->suggested_unroll_factor);
3180 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3181 return opt_result::failure_at (vect_location,
3182 "Vectorization factor too high for"
3183 " epilogue loop.\n");
3186 /* Check the costings of the loop make vectorizing worthwhile. */
3187 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3188 if (res < 0)
3190 ok = opt_result::failure_at (vect_location,
3191 "Loop costings may not be worthwhile.\n");
3192 goto again;
3194 if (!res)
3195 return opt_result::failure_at (vect_location,
3196 "Loop costings not worthwhile.\n");
3198 /* If an epilogue loop is required make sure we can create one. */
3199 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3200 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3201 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3205 if (!vect_can_advance_ivs_p (loop_vinfo)
3206 || !slpeel_can_duplicate_loop_p (loop,
3207 LOOP_VINFO_IV_EXIT (loop_vinfo),
3208 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3210 ok = opt_result::failure_at (vect_location,
3211 "not vectorized: can't create required "
3212 "epilog loop\n");
3213 goto again;
3217 /* During peeling, we need to check if number of loop iterations is
3218 enough for both peeled prolog loop and vector loop. This check
3219 can be merged along with threshold check of loop versioning, so
3220 increase threshold for this case if necessary.
3222 If we are analyzing an epilogue we still want to check what its
3223 versioning threshold would be. If we decide to vectorize the epilogues we
3224 will want to use the lowest versioning threshold of all epilogues and main
3225 loop. This will enable us to enter a vectorized epilogue even when
3226 versioning the loop. We can't simply check whether the epilogue requires
3227 versioning though since we may have skipped some versioning checks when
3228 analyzing the epilogue. For instance, checks for alias versioning will be
3229 skipped when dealing with epilogues as we assume we already checked them
3230 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3231 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3233 poly_uint64 niters_th = 0;
3234 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3236 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3238 /* Niters for peeled prolog loop. */
3239 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3241 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3242 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3243 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3245 else
3246 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3249 /* Niters for at least one iteration of vectorized loop. */
3250 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3251 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3252 /* One additional iteration because of peeling for gap. */
3253 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3254 niters_th += 1;
3256 /* Use the same condition as vect_transform_loop to decide when to use
3257 the cost to determine a versioning threshold. */
3258 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3259 && ordered_p (th, niters_th))
3260 niters_th = ordered_max (poly_uint64 (th), niters_th);
3262 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
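/* For illustration (hypothetical numbers): with 3 iterations peeled for
   alignment, a VF of 8 and peeling for gaps, niters_th starts as
   3 + 8 + 1 = 12, and is then raised to the cost-model threshold TH if
   that is larger.  */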
3265 gcc_assert (known_eq (vectorization_factor,
3266 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3268 slp_done_for_suggested_uf = slp;
3270 /* Ok to vectorize! */
3271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3272 return opt_result::success ();
3274 again:
3275 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3276 gcc_assert (!ok);
3278 /* Try again with SLP forced off but if we didn't do any SLP there is
3279 no point in re-trying. */
3280 if (!slp)
3281 return ok;
3283 /* If the SLP decision was true when the suggested unroll factor was
3284 worked out, and we are now applying that suggested unroll factor, we
3285 don't need to re-try any more. */
3286 if (applying_suggested_uf && slp_done_for_suggested_uf)
3287 return ok;
3289 /* If there are reduction chains re-trying will fail anyway. */
3290 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3291 return ok;
3293 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3294 via interleaving or lane instructions. */
3295 slp_instance instance;
3296 slp_tree node;
3297 unsigned i, j;
3298 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3300 stmt_vec_info vinfo;
3301 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3302 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3303 continue;
3304 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3305 unsigned int size = DR_GROUP_SIZE (vinfo);
3306 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3307 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3308 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3309 && ! vect_grouped_store_supported (vectype, size))
3310 return opt_result::failure_at (vinfo->stmt,
3311 "unsupported grouped store\n");
3312 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3314 vinfo = SLP_TREE_REPRESENTATIVE (node);
3315 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3317 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3318 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3319 size = DR_GROUP_SIZE (vinfo);
3320 vectype = STMT_VINFO_VECTYPE (vinfo);
3321 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3322 && ! vect_grouped_load_supported (vectype, single_element_p,
3323 size))
3324 return opt_result::failure_at (vinfo->stmt,
3325 "unsupported grouped load\n");
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 "re-trying with SLP disabled\n");
3334 /* Roll back state appropriately. No SLP this time. */
3335 slp = false;
3336 /* Restore vectorization factor as it were without SLP. */
3337 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3338 /* Free the SLP instances. */
3339 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3340 vect_free_slp_instance (instance);
3341 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3342 /* Reset SLP type to loop_vect on all stmts. */
3343 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3345 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3346 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3347 !gsi_end_p (si); gsi_next (&si))
3349 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3350 STMT_SLP_TYPE (stmt_info) = loop_vect;
3351 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3352 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3354 /* vectorizable_reduction adjusts reduction stmt def-types,
3355 restore them to that of the PHI. */
3356 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3357 = STMT_VINFO_DEF_TYPE (stmt_info);
3358 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3359 (STMT_VINFO_REDUC_DEF (stmt_info)))
3360 = STMT_VINFO_DEF_TYPE (stmt_info);
3363 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3364 !gsi_end_p (si); gsi_next (&si))
3366 if (is_gimple_debug (gsi_stmt (si)))
3367 continue;
3368 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3369 STMT_SLP_TYPE (stmt_info) = loop_vect;
3370 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3372 stmt_vec_info pattern_stmt_info
3373 = STMT_VINFO_RELATED_STMT (stmt_info);
3374 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3375 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3377 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3378 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3379 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3380 !gsi_end_p (pi); gsi_next (&pi))
3381 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3382 = loop_vect;
3386 /* Free optimized alias test DDRS. */
3387 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3388 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3389 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3390 /* Reset target cost data. */
3391 delete loop_vinfo->vector_costs;
3392 loop_vinfo->vector_costs = nullptr;
3393 /* Reset accumulated rgroup information. */
3394 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3395 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3396 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3397 /* Reset assorted flags. */
3398 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3399 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3400 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3401 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3402 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3403 = saved_can_use_partial_vectors_p;
3405 goto start_over;
3408 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3409 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3410 OLD_LOOP_VINFO is better unless something specifically indicates
3411 otherwise.
3413 Note that this deliberately isn't a partial order. */
3415 static bool
3416 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3417 loop_vec_info old_loop_vinfo)
3419 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3420 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3422 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3423 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3425 /* Always prefer a VF of loop->simdlen over any other VF. */
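/* For example (illustrative loop, not from this file): under
     #pragma omp simd simdlen(8)
   a candidate with VF exactly 8 is preferred over one with VF 4 or 16,
   before any cost comparison below is consulted.  */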
3426 if (loop->simdlen)
3428 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3429 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3430 if (new_simdlen_p != old_simdlen_p)
3431 return new_simdlen_p;
3434 const auto *old_costs = old_loop_vinfo->vector_costs;
3435 const auto *new_costs = new_loop_vinfo->vector_costs;
3436 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3437 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3439 return new_costs->better_main_loop_than_p (old_costs);
3442 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3443 true if we should. */
3445 static bool
3446 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3447 loop_vec_info old_loop_vinfo)
3449 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3450 return false;
3452 if (dump_enabled_p ())
3453 dump_printf_loc (MSG_NOTE, vect_location,
3454 "***** Preferring vector mode %s to vector mode %s\n",
3455 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3456 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3457 return true;
3460 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3461 not NULL. Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is
3462 VOIDmode and advance MODE_I to the next mode useful to analyze.
3463 Return the loop_vinfo on success and wrapped null on failure. */
3465 static opt_loop_vec_info
3466 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3467 const vect_loop_form_info *loop_form_info,
3468 loop_vec_info main_loop_vinfo,
3469 const vector_modes &vector_modes, unsigned &mode_i,
3470 machine_mode &autodetected_vector_mode,
3471 bool &fatal)
3473 loop_vec_info loop_vinfo
3474 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3476 machine_mode vector_mode = vector_modes[mode_i];
3477 loop_vinfo->vector_mode = vector_mode;
3478 unsigned int suggested_unroll_factor = 1;
3479 bool slp_done_for_suggested_uf = false;
3481 /* Run the main analysis. */
3482 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3483 &suggested_unroll_factor,
3484 slp_done_for_suggested_uf);
3485 if (dump_enabled_p ())
3486 dump_printf_loc (MSG_NOTE, vect_location,
3487 "***** Analysis %s with vector mode %s\n",
3488 res ? "succeeded" : "failed",
3489 GET_MODE_NAME (loop_vinfo->vector_mode));
3491 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3493 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_NOTE, vect_location,
3495 "***** Re-trying analysis for unrolling"
3496 " with unroll factor %d and slp %s.\n",
3497 suggested_unroll_factor,
3498 slp_done_for_suggested_uf ? "on" : "off");
3499 loop_vec_info unroll_vinfo
3500 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3501 unroll_vinfo->vector_mode = vector_mode;
3502 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3503 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3504 slp_done_for_suggested_uf);
3505 if (new_res)
3507 delete loop_vinfo;
3508 loop_vinfo = unroll_vinfo;
3510 else
3511 delete unroll_vinfo;
3514 /* Remember the autodetected vector mode. */
3515 if (vector_mode == VOIDmode)
3516 autodetected_vector_mode = loop_vinfo->vector_mode;
3518 /* Advance mode_i, first skipping modes that would result in the
3519 same analysis result. */
3520 while (mode_i + 1 < vector_modes.length ()
3521 && vect_chooses_same_modes_p (loop_vinfo,
3522 vector_modes[mode_i + 1]))
3524 if (dump_enabled_p ())
3525 dump_printf_loc (MSG_NOTE, vect_location,
3526 "***** The result for vector mode %s would"
3527 " be the same\n",
3528 GET_MODE_NAME (vector_modes[mode_i + 1]));
3529 mode_i += 1;
3531 if (mode_i + 1 < vector_modes.length ()
3532 && VECTOR_MODE_P (autodetected_vector_mode)
3533 && (related_vector_mode (vector_modes[mode_i + 1],
3534 GET_MODE_INNER (autodetected_vector_mode))
3535 == autodetected_vector_mode)
3536 && (related_vector_mode (autodetected_vector_mode,
3537 GET_MODE_INNER (vector_modes[mode_i + 1]))
3538 == vector_modes[mode_i + 1]))
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "***** Skipping vector mode %s, which would"
3543 " repeat the analysis for %s\n",
3544 GET_MODE_NAME (vector_modes[mode_i + 1]),
3545 GET_MODE_NAME (autodetected_vector_mode));
3546 mode_i += 1;
3548 mode_i++;
3550 if (!res)
3552 delete loop_vinfo;
3553 if (fatal)
3554 gcc_checking_assert (main_loop_vinfo == NULL);
3555 return opt_loop_vec_info::propagate_failure (res);
3558 return opt_loop_vec_info::success (loop_vinfo);
3561 /* Function vect_analyze_loop.
3563 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3564 for it. The different analyses will record information in the
3565 loop_vec_info struct. */
3566 opt_loop_vec_info
3567 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3569 DUMP_VECT_SCOPE ("analyze_loop_nest");
3571 if (loop_outer (loop)
3572 && loop_vec_info_for_loop (loop_outer (loop))
3573 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3574 return opt_loop_vec_info::failure_at (vect_location,
3575 "outer-loop already vectorized.\n");
3577 if (!find_loop_nest (loop, &shared->loop_nest))
3578 return opt_loop_vec_info::failure_at
3579 (vect_location,
3580 "not vectorized: loop nest containing two or more consecutive inner"
3581 " loops cannot be vectorized\n");
3583 /* Analyze the loop form. */
3584 vect_loop_form_info loop_form_info;
3585 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3586 if (!res)
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "bad loop form.\n");
3591 return opt_loop_vec_info::propagate_failure (res);
3593 if (!integer_onep (loop_form_info.assumptions))
3595 /* We consider vectorizing this loop by versioning it under
3596 some assumptions. In order to do this, we need to clear
3597 existing information computed by the scev and niter analyzers. */
3598 scev_reset_htab ();
3599 free_numbers_of_iterations_estimates (loop);
3600 /* Also set a flag for this loop so that the following scev and niter
3601 analyses are done under the assumptions. */
3602 loop_constraint_set (loop, LOOP_C_FINITE);
3604 else
3605 /* Clear the existing niter information to make sure the nonwrapping flag
3606 will be calculated and set appropriately. */
3607 free_numbers_of_iterations_estimates (loop);
3609 auto_vector_modes vector_modes;
3610 /* Autodetect first vector size we try. */
3611 vector_modes.safe_push (VOIDmode);
3612 unsigned int autovec_flags
3613 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3614 loop->simdlen != 0);
3615 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3616 && !unlimited_cost_model (loop));
3617 machine_mode autodetected_vector_mode = VOIDmode;
3618 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3619 unsigned int mode_i = 0;
3620 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3622 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3623 a mode has not been analyzed. */
3624 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3625 for (unsigned i = 0; i < vector_modes.length (); ++i)
3626 cached_vf_per_mode.safe_push (0);
3628 /* First determine the main loop vectorization mode, either the first
3629 one that works, starting with auto-detecting the vector mode and then
3630 following the target's order of preference, or the one with the
3631 lowest cost if pick_lowest_cost_p. */
3632 while (1)
3634 bool fatal;
3635 unsigned int last_mode_i = mode_i;
3636 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3637 failed. */
3638 cached_vf_per_mode[last_mode_i] = -1;
3639 opt_loop_vec_info loop_vinfo
3640 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3641 NULL, vector_modes, mode_i,
3642 autodetected_vector_mode, fatal);
3643 if (fatal)
3644 break;
3646 if (loop_vinfo)
3648 /* Analysis has been successful so update the VF value. The
3649 VF should always be a multiple of unroll_factor and we want to
3650 capture the original VF here. */
3651 cached_vf_per_mode[last_mode_i]
3652 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3653 loop_vinfo->suggested_unroll_factor);
3654 /* Once we hit the desired simdlen for the first time,
3655 discard any previous attempts. */
3656 if (simdlen
3657 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3659 delete first_loop_vinfo;
3660 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3661 simdlen = 0;
3663 else if (pick_lowest_cost_p
3664 && first_loop_vinfo
3665 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3667 /* Pick loop_vinfo over first_loop_vinfo. */
3668 delete first_loop_vinfo;
3669 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3671 if (first_loop_vinfo == NULL)
3672 first_loop_vinfo = loop_vinfo;
3673 else
3675 delete loop_vinfo;
3676 loop_vinfo = opt_loop_vec_info::success (NULL);
3679 /* Commit to first_loop_vinfo if we have no reason to try
3680 alternatives. */
3681 if (!simdlen && !pick_lowest_cost_p)
3682 break;
3684 if (mode_i == vector_modes.length ()
3685 || autodetected_vector_mode == VOIDmode)
3686 break;
3688 /* Try the next biggest vector size. */
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "***** Re-trying analysis with vector mode %s\n",
3692 GET_MODE_NAME (vector_modes[mode_i]));
3694 if (!first_loop_vinfo)
3695 return opt_loop_vec_info::propagate_failure (res);
3697 if (dump_enabled_p ())
3698 dump_printf_loc (MSG_NOTE, vect_location,
3699 "***** Choosing vector mode %s\n",
3700 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3702 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3703 enabled, SIMDUID is not set, it is the innermost loop and we have
3704 either already found the loop's SIMDLEN or there was no SIMDLEN to
3705 begin with.
3706 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3707 bool vect_epilogues = (!simdlen
3708 && loop->inner == NULL
3709 && param_vect_epilogues_nomask
3710 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3711 /* There is no code motion support for multiple epilogues, so for
3712 now this is not supported for loops with multiple exits. */
3713 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3714 && !loop->simduid);
3715 if (!vect_epilogues)
3716 return first_loop_vinfo;
3718 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3719 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3721 /* For epilogues start the analysis from the first mode. The motivation
3722 behind starting from the beginning comes from cases where the VECTOR_MODES
3723 array may contain length-agnostic and length-specific modes. Their
3724 ordering is not guaranteed, so we could end up picking a mode for the main
3725 loop that is after the epilogue's optimal mode. */
3726 vector_modes[0] = autodetected_vector_mode;
3727 mode_i = 0;
3729 bool supports_partial_vectors =
3730 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3731 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3733 while (1)
3735 /* If the target does not support partial vectors we can shorten the
3736 number of modes to analyze for the epilogue as we know we can't pick a
3737 mode that would lead to a VF at least as big as the
3738 FIRST_VINFO_VF. */
3739 if (!supports_partial_vectors
3740 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3742 mode_i++;
3743 if (mode_i == vector_modes.length ())
3744 break;
3745 continue;
3748 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_NOTE, vect_location,
3750 "***** Re-trying epilogue analysis with vector "
3751 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3753 bool fatal;
3754 opt_loop_vec_info loop_vinfo
3755 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3756 first_loop_vinfo,
3757 vector_modes, mode_i,
3758 autodetected_vector_mode, fatal);
3759 if (fatal)
3760 break;
3762 if (loop_vinfo)
3764 if (pick_lowest_cost_p)
3766 /* Keep trying to roll back vectorization attempts while the
3767 loop_vec_infos they produced were worse than this one. */
3768 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3769 while (!vinfos.is_empty ()
3770 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3772 gcc_assert (vect_epilogues);
3773 delete vinfos.pop ();
3776 /* For now only allow one epilogue loop. */
3777 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3779 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3780 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3781 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3782 || maybe_ne (lowest_th, 0U));
3783 /* Keep track of the known smallest versioning
3784 threshold. */
3785 if (ordered_p (lowest_th, th))
3786 lowest_th = ordered_min (lowest_th, th);
3788 else
3790 delete loop_vinfo;
3791 loop_vinfo = opt_loop_vec_info::success (NULL);
3794 /* For now only allow one epilogue loop, but allow
3795 pick_lowest_cost_p to replace it, so commit to the
3796 first epilogue if we have no reason to try alternatives. */
3797 if (!pick_lowest_cost_p)
3798 break;
3801 if (mode_i == vector_modes.length ())
3802 break;
3806 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3808 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3809 if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE, vect_location,
3811 "***** Choosing epilogue vector mode %s\n",
3812 GET_MODE_NAME
3813 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3816 return first_loop_vinfo;
3819 /* Return true if there is an in-order reduction function for CODE, storing
3820 it in *REDUC_FN if so. */
3822 static bool
3823 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3825 /* We support MINUS_EXPR by negating the operand. This also preserves an
3826 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3827 (-0.0) = -0.0. */
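/* As an illustrative example, an in-order reduction such as

     double s = -0.0;
     for (int i = 0; i < n; i++)
       s -= a[i];

   is handled as IFN_FOLD_LEFT_PLUS over the negated elements, i.e. as
   s += -a[i]; the negation itself is emitted during transformation.  */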
3828 if (code == PLUS_EXPR || code == MINUS_EXPR)
3830 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3831 return true;
3833 return false;
3836 /* Function reduction_fn_for_scalar_code
3838 Input:
3839 CODE - tree_code of a reduction operation.
3841 Output:
3842 REDUC_FN - the corresponding internal function to be used to reduce the
3843 vector of partial results into a single scalar result, or IFN_LAST
3844 if the operation is a supported reduction operation, but does not have
3845 such an internal function.
3847 Return FALSE if CODE currently cannot be vectorized as a reduction. */
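/* For instance, a PLUS_EXPR reduction over V4SI maps to IFN_REDUC_PLUS,
   which reduces {1, 2, 3, 4} to the scalar 10 in the epilogue, whereas
   MULT_EXPR is a supported reduction with no such internal function, so
   *REDUC_FN is set to IFN_LAST and the epilogue is built from generic
   element shuffles instead (illustrative sketch, not target-specific).  */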
3849 bool
3850 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3852 if (code.is_tree_code ())
3853 switch (tree_code (code))
3855 case MAX_EXPR:
3856 *reduc_fn = IFN_REDUC_MAX;
3857 return true;
3859 case MIN_EXPR:
3860 *reduc_fn = IFN_REDUC_MIN;
3861 return true;
3863 case PLUS_EXPR:
3864 *reduc_fn = IFN_REDUC_PLUS;
3865 return true;
3867 case BIT_AND_EXPR:
3868 *reduc_fn = IFN_REDUC_AND;
3869 return true;
3871 case BIT_IOR_EXPR:
3872 *reduc_fn = IFN_REDUC_IOR;
3873 return true;
3875 case BIT_XOR_EXPR:
3876 *reduc_fn = IFN_REDUC_XOR;
3877 return true;
3879 case MULT_EXPR:
3880 case MINUS_EXPR:
3881 *reduc_fn = IFN_LAST;
3882 return true;
3884 default:
3885 return false;
3887 else
3888 switch (combined_fn (code))
3890 CASE_CFN_FMAX:
3891 *reduc_fn = IFN_REDUC_FMAX;
3892 return true;
3894 CASE_CFN_FMIN:
3895 *reduc_fn = IFN_REDUC_FMIN;
3896 return true;
3898 default:
3899 return false;
3903 /* If there is a neutral value X such that a reduction would not be affected
3904 by the introduction of additional X elements, return that X, otherwise
3905 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3906 of the scalar elements. If the reduction has just a single initial value
3907 then INITIAL_VALUE is that value, otherwise it is null.
3908 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3909 In that case no signed zero is returned. */
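/* For example, for a PLUS_EXPR reduction on a float type that honors
   signed zeros, and when the value is not used as the initial value, the
   neutral element is -0.0: x + (-0.0) == x for every x including -0.0,
   whereas padding with +0.0 would turn a -0.0 result into +0.0. For
   MIN_EXPR/MAX_EXPR the only safe padding is the initial value itself.
   (Illustrative restatement of the cases handled below.)  */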
3911 tree
3912 neutral_op_for_reduction (tree scalar_type, code_helper code,
3913 tree initial_value, bool as_initial)
3915 if (code.is_tree_code ())
3916 switch (tree_code (code))
3918 case DOT_PROD_EXPR:
3919 case SAD_EXPR:
3920 case MINUS_EXPR:
3921 case BIT_IOR_EXPR:
3922 case BIT_XOR_EXPR:
3923 return build_zero_cst (scalar_type);
3924 case WIDEN_SUM_EXPR:
3925 case PLUS_EXPR:
3926 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3927 return build_real (scalar_type, dconstm0);
3928 else
3929 return build_zero_cst (scalar_type);
3931 case MULT_EXPR:
3932 return build_one_cst (scalar_type);
3934 case BIT_AND_EXPR:
3935 return build_all_ones_cst (scalar_type);
3937 case MAX_EXPR:
3938 case MIN_EXPR:
3939 return initial_value;
3941 default:
3942 return NULL_TREE;
3944 else
3945 switch (combined_fn (code))
3947 CASE_CFN_FMIN:
3948 CASE_CFN_FMAX:
3949 return initial_value;
3951 default:
3952 return NULL_TREE;
3956 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3957 STMT is printed with a message MSG. */
3959 static void
3960 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3962 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3965 /* Return true if we need an in-order reduction for operation CODE
3966 on type TYPE. */
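/* As an illustrative example, a float accumulation

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   needs a fold-left (in-order) reduction unless -fassociative-math is in
   effect, because FP addition is not associative, whereas MIN/MAX and
   FMIN/FMAX are insensitive to the association order and do not.  */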
3969 bool
3970 needs_fold_left_reduction_p (tree type, code_helper code)
3972 /* CHECKME: check for !flag_finite_math_only too? */
3973 if (SCALAR_FLOAT_TYPE_P (type))
3975 if (code.is_tree_code ())
3976 switch (tree_code (code))
3978 case MIN_EXPR:
3979 case MAX_EXPR:
3980 return false;
3982 default:
3983 return !flag_associative_math;
3985 else
3986 switch (combined_fn (code))
3988 CASE_CFN_FMIN:
3989 CASE_CFN_FMAX:
3990 return false;
3992 default:
3993 return !flag_associative_math;
3997 if (INTEGRAL_TYPE_P (type))
3998 return (!code.is_tree_code ()
3999 || !operation_no_trapping_overflow (type, tree_code (code)));
4001 if (SAT_FIXED_POINT_TYPE_P (type))
4002 return true;
4004 return false;
4007 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
4008 has a handled computation expression. Store the main reduction
4009 operation in *CODE. */
4011 static bool
4012 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4013 tree loop_arg, code_helper *code,
4014 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4016 auto_bitmap visited;
4017 tree lookfor = PHI_RESULT (phi);
4018 ssa_op_iter curri;
4019 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4020 while (USE_FROM_PTR (curr) != loop_arg)
4021 curr = op_iter_next_use (&curri);
4022 curri.i = curri.numops;
4025 path.safe_push (std::make_pair (curri, curr));
4026 tree use = USE_FROM_PTR (curr);
4027 if (use == lookfor)
4028 break;
4029 gimple *def = SSA_NAME_DEF_STMT (use);
4030 if (gimple_nop_p (def)
4031 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4033 pop:
4036 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4037 curri = x.first;
4038 curr = x.second;
4040 curr = op_iter_next_use (&curri);
4041 /* Skip already visited or non-SSA operands (from iterating
4042 over PHI args). */
4043 while (curr != NULL_USE_OPERAND_P
4044 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4045 || ! bitmap_set_bit (visited,
4046 SSA_NAME_VERSION
4047 (USE_FROM_PTR (curr)))));
4049 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4050 if (curr == NULL_USE_OPERAND_P)
4051 break;
4053 else
4055 if (gimple_code (def) == GIMPLE_PHI)
4056 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4057 else
4058 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4059 while (curr != NULL_USE_OPERAND_P
4060 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4061 || ! bitmap_set_bit (visited,
4062 SSA_NAME_VERSION
4063 (USE_FROM_PTR (curr)))))
4064 curr = op_iter_next_use (&curri);
4065 if (curr == NULL_USE_OPERAND_P)
4066 goto pop;
4069 while (1);
4070 if (dump_file && (dump_flags & TDF_DETAILS))
4072 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4073 unsigned i;
4074 std::pair<ssa_op_iter, use_operand_p> *x;
4075 FOR_EACH_VEC_ELT (path, i, x)
4076 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4077 dump_printf (MSG_NOTE, "\n");
4080 /* Check whether the reduction path detected is valid. */
4081 bool fail = path.length () == 0;
4082 bool neg = false;
4083 int sign = -1;
4084 *code = ERROR_MARK;
4085 for (unsigned i = 1; i < path.length (); ++i)
4087 gimple *use_stmt = USE_STMT (path[i].second);
4088 gimple_match_op op;
4089 if (!gimple_extract_op (use_stmt, &op))
4091 fail = true;
4092 break;
4094 unsigned int opi = op.num_ops;
4095 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4097 /* The following makes sure we can compute the operand index
4098 easily, plus it mostly disallows chaining via COND_EXPR condition
4099 operands. */
4100 for (opi = 0; opi < op.num_ops; ++opi)
4101 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4102 break;
4104 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4106 for (opi = 0; opi < op.num_ops; ++opi)
4107 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4108 break;
4110 if (opi == op.num_ops)
4112 fail = true;
4113 break;
4115 op.code = canonicalize_code (op.code, op.type);
4116 if (op.code == MINUS_EXPR)
4118 op.code = PLUS_EXPR;
4119 /* Track whether we negate the reduction value each iteration. */
4120 if (op.ops[1] == op.ops[opi])
4121 neg = ! neg;
4123 else if (op.code == IFN_COND_SUB)
4125 op.code = IFN_COND_ADD;
4126 /* Track whether we negate the reduction value each iteration. */
4127 if (op.ops[2] == op.ops[opi])
4128 neg = ! neg;
4130 if (CONVERT_EXPR_CODE_P (op.code)
4131 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4133 else if (*code == ERROR_MARK)
4135 *code = op.code;
4136 sign = TYPE_SIGN (op.type);
4138 else if (op.code != *code)
4140 fail = true;
4141 break;
4143 else if ((op.code == MIN_EXPR
4144 || op.code == MAX_EXPR)
4145 && sign != TYPE_SIGN (op.type))
4147 fail = true;
4148 break;
4150 /* Check there's only a single stmt the op is used on. For the
4151 non-value-changing tail and the last stmt allow out-of-loop uses.
4152 ??? We could relax this and handle arbitrary live stmts by
4153 forcing a scalar epilogue for example. */
4154 imm_use_iterator imm_iter;
4155 use_operand_p use_p;
4156 gimple *op_use_stmt;
4157 unsigned cnt = 0;
4158 bool cond_fn_p = op.code.is_internal_fn ()
4159 && (conditional_internal_fn_code (internal_fn (op.code))
4160 != ERROR_MARK);
4162 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4164 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4165 op1 twice (once as definition, once as else) in the same operation.
4166 Allow this. */
4167 if (cond_fn_p && op_use_stmt == use_stmt)
4169 gcall *call = as_a<gcall *> (use_stmt);
4170 unsigned else_pos
4171 = internal_fn_else_index (internal_fn (op.code));
4173 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4175 if (j == else_pos)
4176 continue;
4177 if (gimple_call_arg (call, j) == op.ops[opi])
4178 cnt++;
4181 else if (!is_gimple_debug (op_use_stmt)
4182 && (*code != ERROR_MARK
4183 || flow_bb_inside_loop_p (loop,
4184 gimple_bb (op_use_stmt))))
4185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4186 cnt++;
4189 if (cnt != 1)
4191 fail = true;
4192 break;
4195 return ! fail && ! neg && *code != ERROR_MARK;
4198 bool
4199 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4200 tree loop_arg, enum tree_code code)
4202 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4203 code_helper code_;
4204 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4205 && code_ == code);
4210 /* Function vect_is_simple_reduction
4212 (1) Detect a cross-iteration def-use cycle that represents a simple
4213 reduction computation. We look for the following pattern:
4215 loop_header:
4216 a1 = phi < a0, a2 >
4217 a3 = ...
4218 a2 = operation (a3, a1)

 or

4222 a3 = ...
4223 loop_header:
4224 a1 = phi < a0, a2 >
4225 a2 = operation (a3, a1)
4227 such that:
4228 1. operation is commutative and associative and it is safe to
4229 change the order of the computation
4230 2. no uses for a2 in the loop (a2 is used out of the loop)
4231 3. no uses of a1 in the loop besides the reduction operation
4232 4. no uses of a1 outside the loop.
4234 Conditions 1,4 are tested here.
4235 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4237 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4238 nested cycles.
4240 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4241 reductions:
4243 a1 = phi < a0, a2 >
4244 inner loop (def of a3)
4245 a2 = phi < a3 >
4247 (4) Detect condition expressions, i.e.:
4248 for (int i = 0; i < N; i++)
4249 if (a[i] < val)
4250 ret_val = a[i];
4254 static stmt_vec_info
4255 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4256 bool *double_reduc, bool *reduc_chain_p, bool slp)
4258 gphi *phi = as_a <gphi *> (phi_info->stmt);
4259 gimple *phi_use_stmt = NULL;
4260 imm_use_iterator imm_iter;
4261 use_operand_p use_p;
4263 *double_reduc = false;
4264 *reduc_chain_p = false;
4265 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4267 tree phi_name = PHI_RESULT (phi);
4268 /* ??? If there are no uses of the PHI result the inner loop reduction
4269 won't be detected as possibly double-reduction by vectorizable_reduction
4270 because that tries to walk the PHI arg from the preheader edge which
4271 can be constant. See PR60382. */
4272 if (has_zero_uses (phi_name))
4273 return NULL;
4274 class loop *loop = (gimple_bb (phi))->loop_father;
4275 unsigned nphi_def_loop_uses = 0;
4276 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4278 gimple *use_stmt = USE_STMT (use_p);
4279 if (is_gimple_debug (use_stmt))
4280 continue;
4282 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4284 if (dump_enabled_p ())
4285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4286 "intermediate value used outside loop.\n");
4288 return NULL;
4291 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4292 op1 twice (once as definition, once as else) in the same operation.
4293 Only count it as one. */
4294 if (use_stmt != phi_use_stmt)
4296 nphi_def_loop_uses++;
4297 phi_use_stmt = use_stmt;
4301 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4302 if (TREE_CODE (latch_def) != SSA_NAME)
4304 if (dump_enabled_p ())
4305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4306 "reduction: not ssa_name: %T\n", latch_def);
4307 return NULL;
4310 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4311 if (!def_stmt_info
4312 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4313 return NULL;
4315 bool nested_in_vect_loop
4316 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4317 unsigned nlatch_def_loop_uses = 0;
4318 auto_vec<gphi *, 3> lcphis;
4319 bool inner_loop_of_double_reduc = false;
4320 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4322 gimple *use_stmt = USE_STMT (use_p);
4323 if (is_gimple_debug (use_stmt))
4324 continue;
4325 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4326 nlatch_def_loop_uses++;
4327 else
4329 /* We can have more than one loop-closed PHI. */
4330 lcphis.safe_push (as_a <gphi *> (use_stmt));
4331 if (nested_in_vect_loop
4332 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4333 == vect_double_reduction_def))
4334 inner_loop_of_double_reduc = true;
4338 /* If we are vectorizing an inner reduction we are executing that
4339 in the original order only in case we are not dealing with a
4340 double reduction. */
4341 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4343 if (dump_enabled_p ())
4344 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4345 "detected nested cycle: ");
4346 return def_stmt_info;
4349 /* When the inner loop of a double reduction ends up with more than
4350 one loop-closed PHI we have failed to classify alternate such
4351 PHIs as double reduction, leading to wrong code. See PR103237. */
4352 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4356 "unhandle double reduction\n");
4357 return NULL;
4360 /* If this isn't a nested cycle or if the nested cycle reduction value
4361 is used outside of the inner loop we cannot handle uses of the reduction
4362 value. */
4363 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4365 if (dump_enabled_p ())
4366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4367 "reduction used in loop.\n");
4368 return NULL;
4371 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4372 defined in the inner loop. */
4373 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4375 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4376 if (gimple_phi_num_args (def_stmt) != 1
4377 || TREE_CODE (op1) != SSA_NAME)
4379 if (dump_enabled_p ())
4380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4381 "unsupported phi node definition.\n");
4383 return NULL;
4386 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4387 and the latch definition op1. */
4388 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4389 if (gimple_bb (def1)
4390 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4391 && loop->inner
4392 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4393 && (is_gimple_assign (def1) || is_gimple_call (def1))
4394 && is_a <gphi *> (phi_use_stmt)
4395 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4396 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4397 loop_latch_edge (loop->inner))))
4399 if (dump_enabled_p ())
4400 report_vect_op (MSG_NOTE, def_stmt,
4401 "detected double reduction: ");
4403 *double_reduc = true;
4404 return def_stmt_info;
4407 return NULL;
4410 /* Look for the expression computing latch_def from the loop PHI result. */
4411 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4412 code_helper code;
4413 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4414 path))
4416 STMT_VINFO_REDUC_CODE (phi_info) = code;
4417 if (code == COND_EXPR && !nested_in_vect_loop)
4418 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4420 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4421 reduction chain for which the additional restriction is that
4422 all operations in the chain are the same. */
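/* For example (illustrative), a loop body

     sum += a[4*i+0];
     sum += a[4*i+1];
     sum += a[4*i+2];
     sum += a[4*i+3];

   forms a reduction chain of four PLUS_EXPR statements, each feeding the
   next and the last one feeding the loop PHI.  */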
4423 auto_vec<stmt_vec_info, 8> reduc_chain;
4424 unsigned i;
4425 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4426 for (i = path.length () - 1; i >= 1; --i)
4428 gimple *stmt = USE_STMT (path[i].second);
4429 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4430 gimple_match_op op;
4431 if (!gimple_extract_op (stmt, &op))
4432 gcc_unreachable ();
4433 if (gassign *assign = dyn_cast<gassign *> (stmt))
4434 STMT_VINFO_REDUC_IDX (stmt_info)
4435 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4436 else
4438 gcall *call = as_a<gcall *> (stmt);
4439 STMT_VINFO_REDUC_IDX (stmt_info)
4440 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4442 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4443 && (i == 1 || i == path.length () - 1));
4444 if ((op.code != code && !leading_conversion)
4445 /* We can only handle the final value in epilogue
4446 generation for reduction chains. */
4447 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4448 is_slp_reduc = false;
4449 /* For reduction chains we support trailing/leading
4450 conversions. We do not store those in the actual chain. */
4451 if (leading_conversion)
4452 continue;
4453 reduc_chain.safe_push (stmt_info);
4455 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4457 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4459 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4460 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4462 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4463 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4465 /* Save the chain for further analysis in SLP detection. */
4466 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4467 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4469 *reduc_chain_p = true;
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "reduction: detected reduction chain\n");
4474 else if (dump_enabled_p ())
4475 dump_printf_loc (MSG_NOTE, vect_location,
4476 "reduction: detected reduction\n");
4478 return def_stmt_info;
4481 if (dump_enabled_p ())
4482 dump_printf_loc (MSG_NOTE, vect_location,
4483 "reduction: unknown pattern\n");
4485 return NULL;
4488 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4489 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4490 or -1 if not known. */
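/* As a worked example with made-up numbers: for known niters == 100,
   an assumed VF of 8 and 3 peeled prologue iterations the epilogue gets
   (100 - 3) % 8 == 1 iteration; had that remainder been 0 while peeling
   for gaps, a full VF (8 iterations) would be peeled instead. With
   unknown niters the estimate is simply VF / 2.  */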
4492 static int
4493 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4495 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4496 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4498 if (dump_enabled_p ())
4499 dump_printf_loc (MSG_NOTE, vect_location,
4500 "cost model: epilogue peel iters set to vf/2 "
4501 "because loop iterations are unknown .\n");
4502 return assumed_vf / 2;
4504 else
4506 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4507 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4508 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4509 /* If we need to peel for gaps, but no epilogue peeling turned out to be
4510 required, we have to peel VF iterations. */
4511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4512 peel_iters_epilogue = assumed_vf;
4513 return peel_iters_epilogue;
4517 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4519 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4520 int *peel_iters_epilogue,
4521 stmt_vector_for_cost *scalar_cost_vec,
4522 stmt_vector_for_cost *prologue_cost_vec,
4523 stmt_vector_for_cost *epilogue_cost_vec)
4525 int retval = 0;
4527 *peel_iters_epilogue
4528 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4530 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4532 /* If peeled iterations are known but the number of scalar loop
4533 iterations is unknown, count a taken branch per peeled loop. */
4534 if (peel_iters_prologue > 0)
4535 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4536 vect_prologue);
4537 if (*peel_iters_epilogue > 0)
4538 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4539 vect_epilogue);
4542 stmt_info_for_cost *si;
4543 int j;
4544 if (peel_iters_prologue)
4545 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4546 retval += record_stmt_cost (prologue_cost_vec,
4547 si->count * peel_iters_prologue,
4548 si->kind, si->stmt_info, si->misalign,
4549 vect_prologue);
4550 if (*peel_iters_epilogue)
4551 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4552 retval += record_stmt_cost (epilogue_cost_vec,
4553 si->count * *peel_iters_epilogue,
4554 si->kind, si->stmt_info, si->misalign,
4555 vect_epilogue);
4557 return retval;
4560 /* Function vect_estimate_min_profitable_iters
4562 Return the number of iterations required for the vector version of the
4563 loop to be profitable relative to the cost of the scalar version of the
4564 loop.
4566 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4567 of iterations for vectorization. -1 value means loop vectorization
4568 is not profitable. This returned value may be used for dynamic
4569 profitability check.
4571 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4572 for static check against estimated number of iterations. */
4574 static void
4575 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4576 int *ret_min_profitable_niters,
4577 int *ret_min_profitable_estimate,
4578 unsigned *suggested_unroll_factor)
4580 int min_profitable_iters;
4581 int min_profitable_estimate;
4582 int peel_iters_prologue;
4583 int peel_iters_epilogue;
4584 unsigned vec_inside_cost = 0;
4585 int vec_outside_cost = 0;
4586 unsigned vec_prologue_cost = 0;
4587 unsigned vec_epilogue_cost = 0;
4588 int scalar_single_iter_cost = 0;
4589 int scalar_outside_cost = 0;
4590 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4591 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4592 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4594 /* Cost model disabled. */
4595 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4597 if (dump_enabled_p ())
4598 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4599 *ret_min_profitable_niters = 0;
4600 *ret_min_profitable_estimate = 0;
4601 return;
4604 /* Requires loop versioning tests to handle misalignment. */
4605 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4607 /* FIXME: Make cost depend on complexity of individual check. */
4608 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4609 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4610 if (dump_enabled_p ())
4611 dump_printf (MSG_NOTE,
4612 "cost model: Adding cost of checks for loop "
4613 "versioning to treat misalignment.\n");
4616 /* Requires loop versioning with alias checks. */
4617 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4619 /* FIXME: Make cost depend on complexity of individual check. */
4620 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4621 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4622 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4623 if (len)
4624 /* Count LEN - 1 ANDs and LEN comparisons. */
4625 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4626 scalar_stmt, vect_prologue);
4627 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4628 if (len)
4630 /* Count LEN - 1 ANDs and LEN comparisons. */
4631 unsigned int nstmts = len * 2 - 1;
4632 /* +1 for each bias that needs adding. */
4633 for (unsigned int i = 0; i < len; ++i)
4634 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4635 nstmts += 1;
4636 (void) add_stmt_cost (target_cost_data, nstmts,
4637 scalar_stmt, vect_prologue);
4639 if (dump_enabled_p ())
4640 dump_printf (MSG_NOTE,
4641 "cost model: Adding cost of checks for loop "
4642 "versioning aliasing.\n");
4645 /* Requires loop versioning with niter checks. */
4646 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4648 /* FIXME: Make cost depend on complexity of individual check. */
4649 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4650 NULL, NULL, NULL_TREE, 0, vect_prologue);
4651 if (dump_enabled_p ())
4652 dump_printf (MSG_NOTE,
4653 "cost model: Adding cost of checks for loop "
4654 "versioning niters.\n");
4657 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4658 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4659 vect_prologue);
4661 /* Count statements in scalar loop. Using this as scalar cost for a single
4662 iteration for now.
4664 TODO: Add outer loop support.
4666 TODO: Consider assigning different costs to different scalar
4667 statements. */
4669 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4671 /* Add additional cost for the peeled instructions in prologue and epilogue
4672 loop. (For fully-masked loops there will be no peeling.)
4674 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4675 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4677 TODO: Build an expression that represents peel_iters for prologue and
4678 epilogue to be used in a run-time test. */
4680 bool prologue_need_br_taken_cost = false;
4681 bool prologue_need_br_not_taken_cost = false;
4683 /* Calculate peel_iters_prologue. */
4684 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4685 peel_iters_prologue = 0;
4686 else if (npeel < 0)
4688 peel_iters_prologue = assumed_vf / 2;
4689 if (dump_enabled_p ())
4690 dump_printf (MSG_NOTE, "cost model: "
4691 "prologue peel iters set to vf/2.\n");
4693 /* If peeled iterations are unknown, count a taken branch and a not taken
4694 branch per peeled loop. Even if scalar loop iterations are known,
4695 vector iterations are not known since peeled prologue iterations are
4696 not known. Hence guards remain the same. */
4697 prologue_need_br_taken_cost = true;
4698 prologue_need_br_not_taken_cost = true;
4700 else
4702 peel_iters_prologue = npeel;
4703 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4704 /* If peeled iterations are known but the number of scalar loop
4705 iterations is unknown, count a taken branch per peeled loop. */
4706 prologue_need_br_taken_cost = true;
4709 bool epilogue_need_br_taken_cost = false;
4710 bool epilogue_need_br_not_taken_cost = false;
4712 /* Calculate peel_iters_epilogue. */
4713 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4714 /* We need to peel exactly one iteration for gaps. */
4715 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4716 else if (npeel < 0)
4718 /* If peeling for alignment is unknown, the loop bound of the main
4719 loop becomes unknown. */
4720 peel_iters_epilogue = assumed_vf / 2;
4721 if (dump_enabled_p ())
4722 dump_printf (MSG_NOTE, "cost model: "
4723 "epilogue peel iters set to vf/2 because "
4724 "peeling for alignment is unknown.\n");
4726 /* See the same reason above in peel_iters_prologue calculation. */
4727 epilogue_need_br_taken_cost = true;
4728 epilogue_need_br_not_taken_cost = true;
4730 else
4732 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4733 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4734 /* If peeled iterations are known but the number of scalar loop
4735 iterations is unknown, count a taken branch per peeled loop. */
4736 epilogue_need_br_taken_cost = true;
4739 stmt_info_for_cost *si;
4740 int j;
4741 /* Add costs associated with peel_iters_prologue. */
4742 if (peel_iters_prologue)
4743 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4745 (void) add_stmt_cost (target_cost_data,
4746 si->count * peel_iters_prologue, si->kind,
4747 si->stmt_info, si->node, si->vectype,
4748 si->misalign, vect_prologue);
4751 /* Add costs associated with peel_iters_epilogue. */
4752 if (peel_iters_epilogue)
4753 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4755 (void) add_stmt_cost (target_cost_data,
4756 si->count * peel_iters_epilogue, si->kind,
4757 si->stmt_info, si->node, si->vectype,
4758 si->misalign, vect_epilogue);
4761 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4763 if (prologue_need_br_taken_cost)
4764 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4765 vect_prologue);
4767 if (prologue_need_br_not_taken_cost)
4768 (void) add_stmt_cost (target_cost_data, 1,
4769 cond_branch_not_taken, vect_prologue);
4771 if (epilogue_need_br_taken_cost)
4772 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4773 vect_epilogue);
4775 if (epilogue_need_br_not_taken_cost)
4776 (void) add_stmt_cost (target_cost_data, 1,
4777 cond_branch_not_taken, vect_epilogue);
4779 /* Take care of special costs for rgroup controls of partial vectors. */
4780 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4781 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4782 == vect_partial_vectors_avx512))
4784 /* Calculate how many masks we need to generate. */
4785 unsigned int num_masks = 0;
4786 bool need_saturation = false;
4787 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4788 if (rgm.type)
4790 unsigned nvectors = rgm.factor;
4791 num_masks += nvectors;
4792 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4793 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4794 need_saturation = true;
4797 /* ??? The target isn't able to identify the costs below as
4798 producing masks so it cannot penalize cases where we'd run
4799 out of mask registers for example. */
4801 /* ??? We are also failing to account for smaller vector masks
4802 we generate by splitting larger masks in vect_get_loop_mask. */
4804 /* In the worst case, we need to generate each mask in the prologue
4805 and in the loop body. We need one splat per group and one
4806 compare per mask.
4808 Sometimes the prologue mask will fold to a constant,
4809 so the actual prologue cost might be smaller. However, it's
4810 simpler and safer to use the worst-case cost; if this ends up
4811 being the tie-breaker between vectorizing or not, then it's
4812 probably better not to vectorize. */
4813 (void) add_stmt_cost (target_cost_data,
4814 num_masks
4815 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4816 vector_stmt, NULL, NULL, NULL_TREE, 0,
4817 vect_prologue);
4818 (void) add_stmt_cost (target_cost_data,
4819 num_masks
4820 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4821 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4823 /* When we need saturation we need it both in the prologue and
4824 the loop body. */
4825 if (need_saturation)
4827 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4828 NULL, NULL, NULL_TREE, 0, vect_prologue);
4829 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4830 NULL, NULL, NULL_TREE, 0, vect_body);
4833 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4834 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4835 == vect_partial_vectors_while_ult))
4837 /* Calculate how many masks we need to generate. */
4838 unsigned int num_masks = 0;
4839 rgroup_controls *rgm;
4840 unsigned int num_vectors_m1;
4841 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4842 num_vectors_m1, rgm)
4843 if (rgm->type)
4844 num_masks += num_vectors_m1 + 1;
4845 gcc_assert (num_masks > 0);
4847 /* In the worst case, we need to generate each mask in the prologue
4848 and in the loop body. One of the loop body mask instructions
4849 replaces the comparison in the scalar loop, and since we don't
4850 count the scalar comparison against the scalar body, we shouldn't
4851 count that vector instruction against the vector body either.
4853 Sometimes we can use unpacks instead of generating prologue
4854 masks and sometimes the prologue mask will fold to a constant,
4855 so the actual prologue cost might be smaller. However, it's
4856 simpler and safer to use the worst-case cost; if this ends up
4857 being the tie-breaker between vectorizing or not, then it's
4858 probably better not to vectorize. */
4859 (void) add_stmt_cost (target_cost_data, num_masks,
4860 vector_stmt, NULL, NULL, NULL_TREE, 0,
4861 vect_prologue);
4862 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4863 vector_stmt, NULL, NULL, NULL_TREE, 0,
4864 vect_body);
4866 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4868 /* Referring to the functions vect_set_loop_condition_partial_vectors
4869 and vect_set_loop_controls_directly, we need to generate each
4870 length in the prologue and in the loop body if required. Although
4871 there are some possible optimizations, we consider the worst case
4872 here. */
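/* As a made-up illustration of the counting below: a single length
   rgroup (num_vectors == 1) with nitems == 4, unknown niters and a zero
   partial load/store bias contributes at most 1 SHIFT + 2 (MAX/MINUS for
   wrap-around) + 1 MIN == 4 prologue statements, plus, when the loop has
   to iterate, one SELECT_VL/MIN per iteration in the body for a
   decrementing IV (or 3 statements for an incrementing IV).  */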
4874 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4875 signed char partial_load_store_bias
4876 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4877 bool need_iterate_p
4878 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4879 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4881 /* Calculate how many statements need to be added. */
4882 unsigned int prologue_stmts = 0;
4883 unsigned int body_stmts = 0;
4885 rgroup_controls *rgc;
4886 unsigned int num_vectors_m1;
4887 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4888 if (rgc->type)
4890 /* May need one SHIFT for nitems_total computation. */
4891 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4892 if (nitems != 1 && !niters_known_p)
4893 prologue_stmts += 1;
4895 /* May need one MAX and one MINUS for wrap around. */
4896 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4897 prologue_stmts += 2;
4899 /* Need one MAX and one MINUS for each batch limit except for
4900 the first one. */
4901 prologue_stmts += num_vectors_m1 * 2;
4903 unsigned int num_vectors = num_vectors_m1 + 1;
4905 /* Need to set up lengths in prologue, only one MIN required
4906 for each since start index is zero. */
4907 prologue_stmts += num_vectors;
4909 /* If we have a non-zero partial load bias, we need one PLUS
4910 to adjust the load length. */
4911 if (partial_load_store_bias != 0)
4912 body_stmts += 1;
4914 unsigned int length_update_cost = 0;
4915 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4916 /* For decrement IV style, each only needs a single SELECT_VL
4917 or MIN at the beginning to calculate the number of elements
4918 to be processed in the current iteration. */
4919 length_update_cost = 1;
4920 else
4921 /* For increment IV style, each may need two MINs and one MINUS to
4922 update the lengths in the body for the next iteration. */
4923 length_update_cost = 3;
4925 if (need_iterate_p)
4926 body_stmts += length_update_cost * num_vectors;
4929 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4930 scalar_stmt, vect_prologue);
4931 (void) add_stmt_cost (target_cost_data, body_stmts,
4932 scalar_stmt, vect_body);
4935 /* FORNOW: The scalar outside cost is incremented in one of the
4936 following ways:
4938 1. The vectorizer checks for alignment and aliasing and generates
4939 a condition that allows dynamic vectorization. A cost model
4940 check is ANDED with the versioning condition. Hence scalar code
4941 path now has the added cost of the versioning check.
4943 if (cost > th & versioning_check)
4944 jmp to vector code
4946 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4948 2. The vectorizer then checks if a prologue is required. If the
4949 cost model check was not done before during versioning, it has to
4950 be done before the prologue check.
4952 if (cost <= th)
4953 prologue = scalar_iters
4954 if (prologue == 0)
4955 jmp to vector code
4956 else
4957 execute prologue
4958 if (prologue == num_iters)
4959 go to exit
4961 Hence the run-time scalar cost is incremented by a taken branch,
4962 plus a not-taken branch, plus a taken branch cost.
4964 3. The vectorizer then checks if an epilogue is required. If the
4965 cost model check was not done before during prologue check, it
4966 has to be done with the epilogue check.
4968 if (prologue == 0)
4969 jmp to vector code
4970 else
4971 execute prologue
4972 if (prologue == num_iters)
4973 go to exit
4974 vector code:
4975 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4976 jmp to epilogue
4978 Hence the run-time scalar cost should be incremented by 2 taken
4979 branches.
4981 TODO: The back end may reorder the BBs differently and reverse
4982 conditions/branch directions. Change the estimates below to
4983 something more reasonable. */
4985 /* If the number of iterations is known and we do not do versioning, we can
4986 decide whether to vectorize at compile time. Hence the scalar version
4987 does not carry cost model guard costs. */
4988 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4989 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4991 /* Cost model check occurs at versioning. */
4992 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4993 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4994 else
4996 /* Cost model check occurs at prologue generation. */
4997 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4998 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4999 + vect_get_stmt_cost (cond_branch_not_taken);
5000 /* Cost model check occurs at epilogue generation. */
5001 else
5002 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5006 /* Complete the target-specific cost calculations. */
5007 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5008 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5009 suggested_unroll_factor);
5011 if (suggested_unroll_factor && *suggested_unroll_factor > 1
5012 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5013 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5014 *suggested_unroll_factor,
5015 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5017 if (dump_enabled_p ())
5018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5019 "can't unroll as unrolled vectorization factor larger"
5020 " than maximum vectorization factor: "
5021 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5022 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5023 *suggested_unroll_factor = 1;
5026 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5028 if (dump_enabled_p ())
5030 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5031 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5032 vec_inside_cost);
5033 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5034 vec_prologue_cost);
5035 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5036 vec_epilogue_cost);
5037 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5038 scalar_single_iter_cost);
5039 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5040 scalar_outside_cost);
5041 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5042 vec_outside_cost);
5043 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5044 peel_iters_prologue);
5045 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5046 peel_iters_epilogue);
5049 /* Calculate number of iterations required to make the vector version
5050 profitable, relative to the loop bodies only. The following condition
5051 must hold true:
5052 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5053 where
5054 SIC = scalar iteration cost, VIC = vector iteration cost,
5055 VOC = vector outside cost, VF = vectorization factor,
5056 NPEEL = prologue iterations + epilogue iterations,
5057 SOC = scalar outside cost for run time cost model check. */
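/* As a worked example with made-up costs: SIC = 4, VIC = 10, VF = 4,
   VOC = 20, SOC = 6 and NPEEL = 0 give a per-vector-iteration saving of
   SIC * VF - VIC = 6, and the condition above, 4 * niters + 6 >
   10 * (niters / 4) + 20, first holds at niters >= 10; the code below
   derives the same kind of threshold, with additional rounding and
   partial-vector adjustments.  */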
5059 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5060 - vec_inside_cost);
5061 if (saving_per_viter <= 0)
5063 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5064 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5065 "vectorization did not happen for a simd loop");
5067 if (dump_enabled_p ())
5068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5069 "cost model: the vector iteration cost = %d "
5070 "divided by the scalar iteration cost = %d "
5071 "is greater or equal to the vectorization factor = %d"
5072 ".\n",
5073 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5074 *ret_min_profitable_niters = -1;
5075 *ret_min_profitable_estimate = -1;
5076 return;
5079 /* ??? The "if" arm is written to handle all cases; see below for what
5080 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5081 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5083 /* Rewriting the condition above in terms of the number of
5084 vector iterations (vniters) rather than the number of
5085 scalar iterations (niters) gives:
5087 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5089 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5091 For integer N, X and Y when X > 0:
5093 N * X > Y <==> N >= (Y /[floor] X) + 1. */
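  /* Continuing the hypothetical numbers above: outside_overhead
     = 10 - 0 - 0 - 0 = 10 and saving_per_viter = 10, so by the
     identity above min_vec_niters = 10 / 10 + 1 = 2, i.e.
     vniters * 10 > 10 needs at least two vector iterations.  */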
5094 int outside_overhead = (vec_outside_cost
5095 - scalar_single_iter_cost * peel_iters_prologue
5096 - scalar_single_iter_cost * peel_iters_epilogue
5097 - scalar_outside_cost);
5098 /* We're only interested in cases that require at least one
5099 vector iteration. */
5100 int min_vec_niters = 1;
5101 if (outside_overhead > 0)
5102 min_vec_niters = outside_overhead / saving_per_viter + 1;
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5106 min_vec_niters);
5108 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5110 /* Now that we know the minimum number of vector iterations,
5111 find the minimum niters for which the scalar cost is larger:
5113 SIC * niters > VIC * vniters + VOC - SOC
5115 We know that the minimum niters is no more than
5116 vniters * VF + NPEEL, but it might be (and often is) less
5117 than that if a partial vector iteration is cheaper than the
5118 equivalent scalar code. */
5119 int threshold = (vec_inside_cost * min_vec_niters
5120 + vec_outside_cost
5121 - scalar_outside_cost);
5122 if (threshold <= 0)
5123 min_profitable_iters = 1;
5124 else
5125 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5127 else
5128 /* Convert the number of vector iterations into a number of
5129 scalar iterations. */
5130 min_profitable_iters = (min_vec_niters * assumed_vf
5131 + peel_iters_prologue
5132 + peel_iters_epilogue);
5134 else
5136 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5137 * assumed_vf
5138 - vec_inside_cost * peel_iters_prologue
5139 - vec_inside_cost * peel_iters_epilogue);
5140 if (min_profitable_iters <= 0)
5141 min_profitable_iters = 0;
5142 else
5144 min_profitable_iters /= saving_per_viter;
5146 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5147 <= (((int) vec_inside_cost * min_profitable_iters)
5148 + (((int) vec_outside_cost - scalar_outside_cost)
5149 * assumed_vf)))
5150 min_profitable_iters++;
5154 if (dump_enabled_p ())
5155 dump_printf (MSG_NOTE,
5156 " Calculated minimum iters for profitability: %d\n",
5157 min_profitable_iters);
5159 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5160 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5161 /* We want the vectorized loop to execute at least once. */
5162 min_profitable_iters = assumed_vf + peel_iters_prologue;
5163 else if (min_profitable_iters < peel_iters_prologue)
5164 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5165 vectorized loop executes at least once. */
5166 min_profitable_iters = peel_iters_prologue;
5168 if (dump_enabled_p ())
5169 dump_printf_loc (MSG_NOTE, vect_location,
5170 " Runtime profitability threshold = %d\n",
5171 min_profitable_iters);
5173 *ret_min_profitable_niters = min_profitable_iters;
5175 /* Calculate number of iterations required to make the vector version
5176 profitable, relative to the loop bodies only.
5178 The non-vectorized variant costs SIC * niters and must win over the
5179 vector variant at the expected loop trip count. The following condition must hold true:
5180 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
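  /* Note the sign of SOC relative to the runtime condition above: there
     the scalar path has already paid for the runtime check, whereas here
     the vector variant must also beat a scalar loop that performs no
     check.  With the hypothetical numbers used earlier but SOC = 20, the
     overhead to recover in the code below becomes 10 - 0 - 0 + 20 = 30,
     giving a minimum of 30 / 10 + 1 = 4 vector iterations, so the static
     estimate is at least as large as the runtime threshold.  */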
5182 if (vec_outside_cost <= 0)
5183 min_profitable_estimate = 0;
5184 /* ??? This "else if" arm is written to handle all cases; see below for
5185 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5186 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5188 /* This is a repeat of the code above, but with + SOC rather
5189 than - SOC. */
5190 int outside_overhead = (vec_outside_cost
5191 - scalar_single_iter_cost * peel_iters_prologue
5192 - scalar_single_iter_cost * peel_iters_epilogue
5193 + scalar_outside_cost);
5194 int min_vec_niters = 1;
5195 if (outside_overhead > 0)
5196 min_vec_niters = outside_overhead / saving_per_viter + 1;
5198 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5200 int threshold = (vec_inside_cost * min_vec_niters
5201 + vec_outside_cost
5202 + scalar_outside_cost);
5203 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5205 else
5206 min_profitable_estimate = (min_vec_niters * assumed_vf
5207 + peel_iters_prologue
5208 + peel_iters_epilogue);
5210 else
5212 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5213 * assumed_vf
5214 - vec_inside_cost * peel_iters_prologue
5215 - vec_inside_cost * peel_iters_epilogue)
5216 / ((scalar_single_iter_cost * assumed_vf)
5217 - vec_inside_cost);
5219 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5220 if (dump_enabled_p ())
5221 dump_printf_loc (MSG_NOTE, vect_location,
5222 " Static estimate profitability threshold = %d\n",
5223 min_profitable_estimate);
5225 *ret_min_profitable_estimate = min_profitable_estimate;
5228 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5229 vector elements (not bits) for a vector with NELT elements. */
5230 static void
5231 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5232 vec_perm_builder *sel)
5234 /* The encoding is a single stepped pattern. Any wrap-around is handled
5235 by vec_perm_indices. */
5236 sel->new_vector (nelt, 1, 3);
5237 for (unsigned int i = 0; i < 3; i++)
5238 sel->quick_push (i + offset);
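  /* For example, OFFSET = 2 and NELT = 8 push the selector {2, 3, 4};
     vec_perm_indices extends the single stepped pattern to
     {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a shift down by two elements, with
     the wrapped indices 8 and 9 taken from the second permute input.  */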
5241 /* Checks whether the target supports whole-vector shifts for vectors of mode
5242 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5243 it supports vec_perm_const with masks for all necessary shift amounts. */
5244 static bool
5245 have_whole_vector_shift (machine_mode mode)
5247 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5248 return true;
5250 /* Variable-length vectors should be handled via the optab. */
5251 unsigned int nelt;
5252 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5253 return false;
5255 vec_perm_builder sel;
5256 vec_perm_indices indices;
5257 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5259 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5260 indices.new_vector (sel, 2, nelt);
5261 if (!can_vec_perm_const_p (mode, mode, indices, false))
5262 return false;
5264 return true;
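  /* E.g. for an 8-element vector the loop above checks shifts by 4, 2
     and 1 elements, which are exactly the halving steps the shift-based
     reduction epilogue below relies on.  */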
5267 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5268 multiplication operands have differing signs and (b) we intend
5269 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5270 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5272 static bool
5273 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5274 stmt_vec_info stmt_info)
5276 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5277 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5278 return false;
5280 tree rhs1 = gimple_assign_rhs1 (assign);
5281 tree rhs2 = gimple_assign_rhs2 (assign);
5282 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5283 return false;
5285 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5286 gcc_assert (reduc_info->is_reduc_info);
5287 return !directly_supported_p (DOT_PROD_EXPR,
5288 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5289 optab_vector_mixed_sign);
5292 /* TODO: There is a close dependency between the vect_model_*_cost and
5293 vectorizable_* functions. Rework the design to avoid maintenance issues. */
5295 /* Function vect_model_reduction_cost.
5297 Models cost for a reduction operation, including the vector ops
5298 generated within the strip-mine loop in some cases, the initial
5299 definition before the loop, and the epilogue code that must be generated. */
5301 static void
5302 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5303 stmt_vec_info stmt_info, internal_fn reduc_fn,
5304 vect_reduction_type reduction_type,
5305 int ncopies, stmt_vector_for_cost *cost_vec)
5307 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5308 tree vectype;
5309 machine_mode mode;
5310 class loop *loop = NULL;
5312 if (loop_vinfo)
5313 loop = LOOP_VINFO_LOOP (loop_vinfo);
5315 /* Condition reductions generate two reductions in the loop. */
5316 if (reduction_type == COND_REDUCTION)
5317 ncopies *= 2;
5319 vectype = STMT_VINFO_VECTYPE (stmt_info);
5320 mode = TYPE_MODE (vectype);
5321 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5323 gimple_match_op op;
5324 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5325 gcc_unreachable ();
5327 bool emulated_mixed_dot_prod
5328 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5329 if (reduction_type == EXTRACT_LAST_REDUCTION)
5330 /* No extra instructions are needed in the prologue. The loop body
5331 operations are costed in vectorizable_condition. */
5332 inside_cost = 0;
5333 else if (reduction_type == FOLD_LEFT_REDUCTION)
5335 /* No extra instructions needed in the prologue. */
5336 prologue_cost = 0;
5338 if (reduc_fn != IFN_LAST)
5339 /* Count one reduction-like operation per vector. */
5340 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5341 stmt_info, 0, vect_body);
5342 else
5344 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5345 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5346 inside_cost = record_stmt_cost (cost_vec, nelements,
5347 vec_to_scalar, stmt_info, 0,
5348 vect_body);
5349 inside_cost += record_stmt_cost (cost_vec, nelements,
5350 scalar_stmt, stmt_info, 0,
5351 vect_body);
5354 else
5356 /* Add in the cost of the initial definitions. */
5357 int prologue_stmts;
5358 if (reduction_type == COND_REDUCTION)
5359 /* For cond reductions we have four vectors: initial index, step,
5360 initial result of the data reduction, initial value of the index
5361 reduction. */
5362 prologue_stmts = 4;
5363 else if (emulated_mixed_dot_prod)
5364 /* We need the initial reduction value and two invariants:
5365 one that contains the minimum signed value and one that
5366 contains half of its negative. */
5367 prologue_stmts = 3;
5368 else
5369 prologue_stmts = 1;
5370 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5371 scalar_to_vec, stmt_info, 0,
5372 vect_prologue);
5375 /* Determine cost of epilogue code.
5377 We have a reduction operator that will reduce the vector in one statement.
5378 Also requires scalar extract. */
5380 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5382 if (reduc_fn != IFN_LAST)
5384 if (reduction_type == COND_REDUCTION)
5386 /* An EQ stmt and a COND_EXPR stmt. */
5387 epilogue_cost += record_stmt_cost (cost_vec, 2,
5388 vector_stmt, stmt_info, 0,
5389 vect_epilogue);
5390 /* Reduction of the max index and a reduction of the found
5391 values. */
5392 epilogue_cost += record_stmt_cost (cost_vec, 2,
5393 vec_to_scalar, stmt_info, 0,
5394 vect_epilogue);
5395 /* A broadcast of the max value. */
5396 epilogue_cost += record_stmt_cost (cost_vec, 1,
5397 scalar_to_vec, stmt_info, 0,
5398 vect_epilogue);
5400 else
5402 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5403 stmt_info, 0, vect_epilogue);
5404 epilogue_cost += record_stmt_cost (cost_vec, 1,
5405 vec_to_scalar, stmt_info, 0,
5406 vect_epilogue);
5409 else if (reduction_type == COND_REDUCTION)
5411 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5412 /* Extraction of scalar elements. */
5413 epilogue_cost += record_stmt_cost (cost_vec,
5414 2 * estimated_nunits,
5415 vec_to_scalar, stmt_info, 0,
5416 vect_epilogue);
5417 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5418 epilogue_cost += record_stmt_cost (cost_vec,
5419 2 * estimated_nunits - 3,
5420 scalar_stmt, stmt_info, 0,
5421 vect_epilogue);
5423 else if (reduction_type == EXTRACT_LAST_REDUCTION
5424 || reduction_type == FOLD_LEFT_REDUCTION)
5425 /* No extra instructions are needed in the epilogue. */
5427 else
5429 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5430 tree bitsize = TYPE_SIZE (op.type);
5431 int element_bitsize = tree_to_uhwi (bitsize);
5432 int nelements = vec_size_in_bits / element_bitsize;
5434 if (op.code == COND_EXPR)
5435 op.code = MAX_EXPR;
5437 /* We have a whole vector shift available. */
5438 if (VECTOR_MODE_P (mode)
5439 && directly_supported_p (op.code, vectype)
5440 && have_whole_vector_shift (mode))
5442 /* Final reduction via vector shifts and the reduction operator.
5443 Also requires scalar extract. */
5444 epilogue_cost += record_stmt_cost (cost_vec,
5445 exact_log2 (nelements) * 2,
5446 vector_stmt, stmt_info, 0,
5447 vect_epilogue);
5448 epilogue_cost += record_stmt_cost (cost_vec, 1,
5449 vec_to_scalar, stmt_info, 0,
5450 vect_epilogue);
5452 else
5453 /* Use extracts and reduction op for final reduction. For N
5454 elements, we have N extracts and N-1 reduction ops. */
5455 epilogue_cost += record_stmt_cost (cost_vec,
5456 nelements + nelements - 1,
5457 vector_stmt, stmt_info, 0,
5458 vect_epilogue);
5462 if (dump_enabled_p ())
5463 dump_printf (MSG_NOTE,
5464 "vect_model_reduction_cost: inside_cost = %d, "
5465 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5466 prologue_cost, epilogue_cost);
5469 /* SEQ is a sequence of instructions that initialize the reduction
5470 described by REDUC_INFO. Emit them in the appropriate place. */
5472 static void
5473 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5474 stmt_vec_info reduc_info, gimple *seq)
5476 if (reduc_info->reused_accumulator)
5478 /* When reusing an accumulator from the main loop, we only need
5479 initialization instructions if the main loop can be skipped.
5480 In that case, emit the initialization instructions at the end
5481 of the guard block that does the skip. */
5482 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5483 gcc_assert (skip_edge);
5484 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5485 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5487 else
5489 /* The normal case: emit the initialization instructions on the
5490 preheader edge. */
5491 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5492 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5496 /* Function get_initial_def_for_reduction
5498 Input:
5499 REDUC_INFO - the info_for_reduction
5500 INIT_VAL - the initial value of the reduction variable
5501 NEUTRAL_OP - a value that has no effect on the reduction, as per
5502 neutral_op_for_reduction
5504 Output:
5505 Return a vector variable, initialized according to the operation that
5506 STMT_VINFO performs. This vector will be used as the initial value
5507 of the vector of partial results.
5509 The value we need is a vector in which element 0 has value INIT_VAL
5510 and every other element has value NEUTRAL_OP. */
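  /* E.g. for a sum reduction over V4SI with INIT_VAL s and NEUTRAL_OP 0
     this is {s, 0, 0, 0}; for a MIN or MAX reduction NEUTRAL_OP is
     INIT_VAL itself, so the vector degenerates to a splat of s.  */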
5512 static tree
5513 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5514 stmt_vec_info reduc_info,
5515 tree init_val, tree neutral_op)
5517 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5518 tree scalar_type = TREE_TYPE (init_val);
5519 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5520 tree init_def;
5521 gimple_seq stmts = NULL;
5523 gcc_assert (vectype);
5525 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5526 || SCALAR_FLOAT_TYPE_P (scalar_type));
5528 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5529 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5531 if (operand_equal_p (init_val, neutral_op))
5533 /* If both elements are equal then the vector described above is
5534 just a splat. */
5535 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5536 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5538 else
5540 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5541 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5542 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5544 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5545 element 0. */
5546 init_def = gimple_build_vector_from_val (&stmts, vectype,
5547 neutral_op);
5548 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5549 vectype, init_def, init_val);
5551 else
5553 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5554 tree_vector_builder elts (vectype, 1, 2);
5555 elts.quick_push (init_val);
5556 elts.quick_push (neutral_op);
5557 init_def = gimple_build_vector (&stmts, &elts);
5561 if (stmts)
5562 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5563 return init_def;
5566 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5567 which performs a reduction involving GROUP_SIZE scalar statements.
5568 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5569 is nonnull, introducing extra elements of that value will not change the
5570 result. */
5572 static void
5573 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5574 stmt_vec_info reduc_info,
5575 vec<tree> *vec_oprnds,
5576 unsigned int number_of_vectors,
5577 unsigned int group_size, tree neutral_op)
5579 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5580 unsigned HOST_WIDE_INT nunits;
5581 unsigned j, number_of_places_left_in_vector;
5582 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5583 unsigned int i;
5585 gcc_assert (group_size == initial_values.length () || neutral_op);
5587 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5588 created vectors. It is greater than 1 if unrolling is performed.
5590 For example, we have two scalar operands, s1 and s2 (e.g., group of
5591 strided accesses of size two), while NUNITS is four (i.e., four scalars
5592 of this type can be packed in a vector). The output vector will contain
5593 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5594 will be 2).
5596 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5597 vectors containing the operands.
5599 For example, NUNITS is four as before, and the group size is 8
5600 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5601 {s5, s6, s7, s8}. */
5603 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5604 nunits = group_size;
5606 number_of_places_left_in_vector = nunits;
5607 bool constant_p = true;
5608 tree_vector_builder elts (vector_type, nunits, 1);
5609 elts.quick_grow (nunits);
5610 gimple_seq ctor_seq = NULL;
5611 for (j = 0; j < nunits * number_of_vectors; ++j)
5613 tree op;
5614 i = j % group_size;
5616 /* Get the def before the loop. In reduction chain we have only
5617 one initial value. Else we have as many as PHIs in the group. */
5618 if (i >= initial_values.length () || (j > i && neutral_op))
5619 op = neutral_op;
5620 else
5621 op = initial_values[i];
5623 /* Create 'vect_ = {op0,op1,...,opn}'. */
5624 number_of_places_left_in_vector--;
5625 elts[nunits - number_of_places_left_in_vector - 1] = op;
5626 if (!CONSTANT_CLASS_P (op))
5627 constant_p = false;
5629 if (number_of_places_left_in_vector == 0)
5631 tree init;
5632 if (constant_p && !neutral_op
5633 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5634 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5635 /* Build the vector directly from ELTS. */
5636 init = gimple_build_vector (&ctor_seq, &elts);
5637 else if (neutral_op)
5639 /* Build a vector of the neutral value and shift the
5640 other elements into place. */
5641 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5642 neutral_op);
5643 int k = nunits;
5644 while (k > 0 && elts[k - 1] == neutral_op)
5645 k -= 1;
5646 while (k > 0)
5648 k -= 1;
5649 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5650 vector_type, init, elts[k]);
5653 else
5655 /* First time round, duplicate ELTS to fill the
5656 required number of vectors. */
5657 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5658 elts, number_of_vectors, *vec_oprnds);
5659 break;
5661 vec_oprnds->quick_push (init);
5663 number_of_places_left_in_vector = nunits;
5664 elts.new_vector (vector_type, nunits, 1);
5665 elts.quick_grow (nunits);
5666 constant_p = true;
5669 if (ctor_seq != NULL)
5670 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5673 /* For a statement STMT_INFO taking part in a reduction operation return
5674 the stmt_vec_info the meta information is stored on. */
5676 stmt_vec_info
5677 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5679 stmt_info = vect_orig_stmt (stmt_info);
5680 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5681 if (!is_a <gphi *> (stmt_info->stmt)
5682 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5683 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5684 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5685 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5687 if (gimple_phi_num_args (phi) == 1)
5688 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5690 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5692 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5693 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5694 stmt_info = info;
5696 return stmt_info;
5699 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5700 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5701 return false. */
5703 static bool
5704 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5705 stmt_vec_info reduc_info)
5707 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5708 if (!main_loop_vinfo)
5709 return false;
5711 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5712 return false;
5714 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5715 auto_vec<tree, 16> main_loop_results (num_phis);
5716 auto_vec<tree, 16> initial_values (num_phis);
5717 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5719 /* The epilogue loop can be entered either from the main loop or
5720 from an earlier guard block. */
5721 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5722 for (tree incoming_value : reduc_info->reduc_initial_values)
5724 /* Look for:
5726 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5727 INITIAL_VALUE(guard block)>. */
5728 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5730 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5731 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5733 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5734 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5736 main_loop_results.quick_push (from_main_loop);
5737 initial_values.quick_push (from_skip);
5740 else
5741 /* The main loop dominates the epilogue loop. */
5742 main_loop_results.splice (reduc_info->reduc_initial_values);
5744 /* See if the main loop has the kind of accumulator we need. */
5745 vect_reusable_accumulator *accumulator
5746 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5747 if (!accumulator
5748 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5749 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5750 accumulator->reduc_info->reduc_scalar_results.begin ()))
5751 return false;
5753 /* Handle the case where we can reduce wider vectors to narrower ones. */
5754 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5755 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5756 unsigned HOST_WIDE_INT m;
5757 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5758 TYPE_VECTOR_SUBPARTS (vectype), &m))
5759 return false;
5760 /* Check the intermediate vector types and operations are available. */
5761 tree prev_vectype = old_vectype;
5762 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5763 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5765 intermediate_nunits = exact_div (intermediate_nunits, 2);
5766 tree intermediate_vectype = get_related_vectype_for_scalar_type
5767 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5768 if (!intermediate_vectype
5769 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5770 intermediate_vectype)
5771 || !can_vec_extract (TYPE_MODE (prev_vectype),
5772 TYPE_MODE (intermediate_vectype)))
5773 return false;
5774 prev_vectype = intermediate_vectype;
5777 /* Non-SLP reductions might apply an adjustment after the reduction
5778 operation, in order to simplify the initialization of the accumulator.
5779 If the epilogue loop carries on from where the main loop left off,
5780 it should apply the same adjustment to the final reduction result.
5782 If the epilogue loop can also be entered directly (rather than via
5783 the main loop), we need to be able to handle that case in the same way,
5784 with the same adjustment. (In principle we could add a PHI node
5785 to select the correct adjustment, but in practice that shouldn't be
5786 necessary.) */
5787 tree main_adjustment
5788 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5789 if (loop_vinfo->main_loop_edge && main_adjustment)
5791 gcc_assert (num_phis == 1);
5792 tree initial_value = initial_values[0];
5793 /* Check that we can use INITIAL_VALUE as the adjustment and
5794 initialize the accumulator with a neutral value instead. */
5795 if (!operand_equal_p (initial_value, main_adjustment))
5796 return false;
5797 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5798 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5799 code, initial_value);
5801 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5802 reduc_info->reduc_initial_values.truncate (0);
5803 reduc_info->reduc_initial_values.splice (initial_values);
5804 reduc_info->reused_accumulator = accumulator;
5805 return true;
5808 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5809 CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
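  /* E.g. reducing a 16-element vector down to a 4-element VECTYPE takes
     two halving steps, 16 -> 8 -> 4, each extracting the low and high
     halves of the previous vector and combining them with CODE.  */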
5811 static tree
5812 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5813 gimple_seq *seq)
5815 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5816 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5817 tree stype = TREE_TYPE (vectype);
5818 tree new_temp = vec_def;
5819 while (nunits > nunits1)
5821 nunits /= 2;
5822 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5823 stype, nunits);
5824 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5826 /* The target has to make sure we support lowpart/highpart
5827 extraction, either via direct vector extract or through
5828 integer mode punning. */
5829 tree dst1, dst2;
5830 gimple *epilog_stmt;
5831 if (convert_optab_handler (vec_extract_optab,
5832 TYPE_MODE (TREE_TYPE (new_temp)),
5833 TYPE_MODE (vectype1))
5834 != CODE_FOR_nothing)
5836 /* Extract sub-vectors directly once vec_extract becomes
5837 a conversion optab. */
5838 dst1 = make_ssa_name (vectype1);
5839 epilog_stmt
5840 = gimple_build_assign (dst1, BIT_FIELD_REF,
5841 build3 (BIT_FIELD_REF, vectype1,
5842 new_temp, TYPE_SIZE (vectype1),
5843 bitsize_int (0)));
5844 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5845 dst2 = make_ssa_name (vectype1);
5846 epilog_stmt
5847 = gimple_build_assign (dst2, BIT_FIELD_REF,
5848 build3 (BIT_FIELD_REF, vectype1,
5849 new_temp, TYPE_SIZE (vectype1),
5850 bitsize_int (bitsize)));
5851 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5853 else
5855 /* Extract via punning to appropriately sized integer mode
5856 vector. */
5857 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5858 tree etype = build_vector_type (eltype, 2);
5859 gcc_assert (convert_optab_handler (vec_extract_optab,
5860 TYPE_MODE (etype),
5861 TYPE_MODE (eltype))
5862 != CODE_FOR_nothing);
5863 tree tem = make_ssa_name (etype);
5864 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5865 build1 (VIEW_CONVERT_EXPR,
5866 etype, new_temp));
5867 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5868 new_temp = tem;
5869 tem = make_ssa_name (eltype);
5870 epilog_stmt
5871 = gimple_build_assign (tem, BIT_FIELD_REF,
5872 build3 (BIT_FIELD_REF, eltype,
5873 new_temp, TYPE_SIZE (eltype),
5874 bitsize_int (0)));
5875 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5876 dst1 = make_ssa_name (vectype1);
5877 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5878 build1 (VIEW_CONVERT_EXPR,
5879 vectype1, tem));
5880 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5881 tem = make_ssa_name (eltype);
5882 epilog_stmt
5883 = gimple_build_assign (tem, BIT_FIELD_REF,
5884 build3 (BIT_FIELD_REF, eltype,
5885 new_temp, TYPE_SIZE (eltype),
5886 bitsize_int (bitsize)));
5887 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5888 dst2 = make_ssa_name (vectype1);
5889 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5890 build1 (VIEW_CONVERT_EXPR,
5891 vectype1, tem));
5892 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5895 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5898 return new_temp;
5901 /* Retrieves the defining statement to be used for a reduction.
5902 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5903 the reduction definitions. */
5905 tree
5906 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5907 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5908 vec <gimple *> &vec_stmts)
5910 tree def;
5912 if (slp_node)
5914 if (!main_exit_p)
5915 slp_node = slp_node_instance->reduc_phis;
5916 def = vect_get_slp_vect_def (slp_node, i);
5918 else
5920 if (!main_exit_p)
5921 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5922 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5923 def = gimple_get_lhs (vec_stmts[0]);
5926 return def;
5929 /* Function vect_create_epilog_for_reduction
5931 Create code at the loop-epilog to finalize the result of a reduction
5932 computation.
5934 STMT_INFO is the scalar reduction stmt that is being vectorized.
5935 SLP_NODE is an SLP node containing a group of reduction statements. The
5936 first one in this group is STMT_INFO.
5937 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5938 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5939 (counting from 0)
5940 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5941 exit this edge is always the main loop exit.
5943 This function:
5944 1. Completes the reduction def-use cycles.
5945 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5946 by calling the function specified by REDUC_FN if available, or by
5947 other means (whole-vector shifts or a scalar loop).
5948 The function also creates a new phi node at the loop exit to preserve
5949 loop-closed form, as illustrated below.
5951 The flow at the entry to this function:
5953 loop:
5954 vec_def = phi <vec_init, null> # REDUCTION_PHI
5955 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5956 s_loop = scalar_stmt # (scalar) STMT_INFO
5957 loop_exit:
5958 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5959 use <s_out0>
5960 use <s_out0>
5962 The above is transformed by this function into:
5964 loop:
5965 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5966 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5967 s_loop = scalar_stmt # (scalar) STMT_INFO
5968 loop_exit:
5969 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5970 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5971 v_out2 = reduce <v_out1>
5972 s_out3 = extract_field <v_out2, 0>
5973 s_out4 = adjust_result <s_out3>
5974 use <s_out4>
5975 use <s_out4>
5978 static void
5979 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5980 stmt_vec_info stmt_info,
5981 slp_tree slp_node,
5982 slp_instance slp_node_instance,
5983 edge loop_exit)
5985 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5986 gcc_assert (reduc_info->is_reduc_info);
5987 /* For double reductions we need to get at the inner loop reduction
5988 stmt which has the meta info attached. Our stmt_info is that of the
5989 loop-closed PHI of the inner loop which we remember as
5990 def for the reduction PHI generation. */
5991 bool double_reduc = false;
5992 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5993 stmt_vec_info rdef_info = stmt_info;
5994 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5996 gcc_assert (!slp_node);
5997 double_reduc = true;
5998 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5999 (stmt_info->stmt, 0));
6000 stmt_info = vect_stmt_to_vectorize (stmt_info);
6002 gphi *reduc_def_stmt
6003 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
6004 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6005 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6006 tree vectype;
6007 machine_mode mode;
6008 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6009 basic_block exit_bb;
6010 tree scalar_dest;
6011 tree scalar_type;
6012 gimple *new_phi = NULL, *phi = NULL;
6013 gimple_stmt_iterator exit_gsi;
6014 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6015 gimple *epilog_stmt = NULL;
6016 gimple *exit_phi;
6017 tree bitsize;
6018 tree def;
6019 tree orig_name, scalar_result;
6020 imm_use_iterator imm_iter, phi_imm_iter;
6021 use_operand_p use_p, phi_use_p;
6022 gimple *use_stmt;
6023 auto_vec<tree> reduc_inputs;
6024 int j, i;
6025 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6026 unsigned int group_size = 1, k;
6027 auto_vec<gimple *> phis;
6028 /* SLP reduction without reduction chain, e.g.,
6029 # a1 = phi <a2, a0>
6030 # b1 = phi <b2, b0>
6031 a2 = operation (a1)
6032 b2 = operation (b1) */
6033 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6034 bool direct_slp_reduc;
6035 tree induction_index = NULL_TREE;
6037 if (slp_node)
6038 group_size = SLP_TREE_LANES (slp_node);
6040 if (nested_in_vect_loop_p (loop, stmt_info))
6042 outer_loop = loop;
6043 loop = loop->inner;
6044 gcc_assert (!slp_node && double_reduc);
6047 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6048 gcc_assert (vectype);
6049 mode = TYPE_MODE (vectype);
6051 tree induc_val = NULL_TREE;
6052 tree adjustment_def = NULL;
6053 if (slp_node)
6055 else
6057 /* Optimize: for induction condition reduction, if we can't use zero
6058 for induc_val, use initial_def. */
6059 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6060 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6061 else if (double_reduc)
6063 else
6064 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6067 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6068 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6069 if (slp_reduc)
6070 /* All statements produce live-out values. */
6071 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6072 else if (slp_node)
6074 /* The last statement in the reduction chain produces the live-out
6075 value. Note SLP optimization can shuffle scalar stmts to
6076 optimize permutations so we have to search for the last stmt. */
6077 for (k = 0; k < group_size; ++k)
6078 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6080 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6081 break;
6085 unsigned vec_num;
6086 int ncopies;
6087 if (slp_node)
6089 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6090 ncopies = 1;
6092 else
6094 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6095 vec_num = 1;
6096 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6099 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6100 which is updated with the current index of the loop for every match of
6101 the original loop's cond_expr (VEC_STMT). This results in a vector
6102 containing the last time the condition passed for that vector lane.
6103 The first match will be a 1 to allow 0 to be used for non-matching
6104 indexes. If there are no matches at all then the vector will be all
6105 zeroes.
6107 PR92772: This algorithm is broken for architectures that support
6108 masked vectors, but do not provide fold_extract_last. */
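  /* Illustration with VF = 4: the index IV takes the values
     {1, 2, 3, 4}, {5, 6, 7, 8}, ... per vector iteration.  If the
     condition only holds for the scalar iterations with indexes 3 and 6,
     the phi vector is {0, 0, 3, 0} after the first iteration and
     {0, 6, 3, 0} after the second; the largest element, 6, then
     identifies the last match in the epilogue below.  */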
6109 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6111 auto_vec<std::pair<tree, bool>, 2> ccompares;
6112 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6113 cond_info = vect_stmt_to_vectorize (cond_info);
6114 while (cond_info != reduc_info)
6116 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6118 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6119 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6120 ccompares.safe_push
6121 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6122 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6124 cond_info
6125 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6126 1 + STMT_VINFO_REDUC_IDX
6127 (cond_info)));
6128 cond_info = vect_stmt_to_vectorize (cond_info);
6130 gcc_assert (ccompares.length () != 0);
6132 tree indx_before_incr, indx_after_incr;
6133 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6134 int scalar_precision
6135 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6136 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6137 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6138 (TYPE_MODE (vectype), cr_index_scalar_type,
6139 TYPE_VECTOR_SUBPARTS (vectype));
6141 /* First we create a simple vector induction variable which starts
6142 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6143 vector size (STEP). */
6145 /* Create a {1,2,3,...} vector. */
6146 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6148 /* Create a vector of the step value. */
6149 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6150 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6152 /* Create an induction variable. */
6153 gimple_stmt_iterator incr_gsi;
6154 bool insert_after;
6155 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6156 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6157 insert_after, &indx_before_incr, &indx_after_incr);
6159 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6160 filled with zeros (VEC_ZERO). */
6162 /* Create a vector of 0s. */
6163 tree zero = build_zero_cst (cr_index_scalar_type);
6164 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6166 /* Create a vector phi node. */
6167 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6168 new_phi = create_phi_node (new_phi_tree, loop->header);
6169 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6170 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6172 /* Now take the condition from the loops original cond_exprs
6173 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6174 every match uses values from the induction variable
6175 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6176 (NEW_PHI_TREE).
6177 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6178 the new cond_expr (INDEX_COND_EXPR). */
6179 gimple_seq stmts = NULL;
6180 for (int i = ccompares.length () - 1; i != -1; --i)
6182 tree ccompare = ccompares[i].first;
6183 if (ccompares[i].second)
6184 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6185 cr_index_vector_type,
6186 ccompare,
6187 indx_before_incr, new_phi_tree);
6188 else
6189 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6190 cr_index_vector_type,
6191 ccompare,
6192 new_phi_tree, indx_before_incr);
6194 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6196 /* Update the phi with the vec cond. */
6197 induction_index = new_phi_tree;
6198 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6199 loop_latch_edge (loop), UNKNOWN_LOCATION);
6202 /* 2. Create epilog code.
6203 The reduction epilog code operates across the elements of the vector
6204 of partial results computed by the vectorized loop.
6205 The reduction epilog code consists of:
6207 step 1: compute the scalar result in a vector (v_out2)
6208 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6209 step 3: adjust the scalar result (s_out3) if needed.
6211 Step 1 can be accomplished using one of the following three schemes:
6212 (scheme 1) using reduc_fn, if available.
6213 (scheme 2) using whole-vector shifts, if available.
6214 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6215 combined.
6217 The overall epilog code looks like this:
6219 s_out0 = phi <s_loop> # original EXIT_PHI
6220 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6221 v_out2 = reduce <v_out1> # step 1
6222 s_out3 = extract_field <v_out2, 0> # step 2
6223 s_out4 = adjust_result <s_out3> # step 3
6225 (step 3 is optional, and steps 1 and 2 may be combined).
6226 Lastly, the uses of s_out0 are replaced by s_out4. */
6229 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6230 v_out1 = phi <VECT_DEF>
6231 Store them in NEW_PHIS. */
6232 if (double_reduc)
6233 loop = outer_loop;
6234 /* We need to reduce values in all exits. */
6235 exit_bb = loop_exit->dest;
6236 exit_gsi = gsi_after_labels (exit_bb);
6237 reduc_inputs.create (slp_node ? vec_num : ncopies);
6238 vec <gimple *> vec_stmts = vNULL;
6239 for (unsigned i = 0; i < vec_num; i++)
6241 gimple_seq stmts = NULL;
6242 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6243 main_exit_p, i, vec_stmts);
6244 for (j = 0; j < ncopies; j++)
6246 tree new_def = copy_ssa_name (def);
6247 phi = create_phi_node (new_def, exit_bb);
6248 if (j)
6249 def = gimple_get_lhs (vec_stmts[j]);
6250 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6251 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6252 else
6254 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6255 SET_PHI_ARG_DEF (phi, k, def);
6257 new_def = gimple_convert (&stmts, vectype, new_def);
6258 reduc_inputs.quick_push (new_def);
6260 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6263 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6264 (i.e. when reduc_fn is not available) and in the final adjustment
6265 code (if needed). Also get the original scalar reduction variable as
6266 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6267 represents a reduction pattern), the tree-code and scalar-def are
6268 taken from the original stmt that the pattern-stmt (STMT) replaces.
6269 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6270 are taken from STMT. */
6272 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6273 if (orig_stmt_info != stmt_info)
6275 /* Reduction pattern */
6276 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6277 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6280 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6281 scalar_type = TREE_TYPE (scalar_dest);
6282 scalar_results.truncate (0);
6283 scalar_results.reserve_exact (group_size);
6284 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6285 bitsize = TYPE_SIZE (scalar_type);
6287 /* True if we should implement SLP_REDUC using native reduction operations
6288 instead of scalar operations. */
6289 direct_slp_reduc = (reduc_fn != IFN_LAST
6290 && slp_reduc
6291 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6293 /* In case of reduction chain, e.g.,
6294 # a1 = phi <a3, a0>
6295 a2 = operation (a1)
6296 a3 = operation (a2),
6298 we may end up with more than one vector result. Here we reduce them
6299 to one vector.
6301 The same is true for a SLP reduction, e.g.,
6302 # a1 = phi <a2, a0>
6303 # b1 = phi <b2, b0>
6304 a2 = operation (a1)
6305 b2 = operation (b1),
6307 where we can end up with more than one vector as well. We can
6308 easily accumulate vectors when the number of vector elements is
6309 a multiple of the SLP group size.
6311 The same is true if we couldn't use a single defuse cycle. */
6312 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6313 || direct_slp_reduc
6314 || (slp_reduc
6315 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6316 || ncopies > 1)
6318 gimple_seq stmts = NULL;
6319 tree single_input = reduc_inputs[0];
6320 for (k = 1; k < reduc_inputs.length (); k++)
6321 single_input = gimple_build (&stmts, code, vectype,
6322 single_input, reduc_inputs[k]);
6323 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6325 reduc_inputs.truncate (0);
6326 reduc_inputs.safe_push (single_input);
6329 tree orig_reduc_input = reduc_inputs[0];
6331 /* If this loop is an epilogue loop that can be skipped after the
6332 main loop, we can only share a reduction operation between the
6333 main loop and the epilogue if we put it at the target of the
6334 skip edge.
6336 We can still reuse accumulators if this check fails. Doing so has
6337 the minor(?) benefit of making the epilogue loop's scalar result
6338 independent of the main loop's scalar result. */
6339 bool unify_with_main_loop_p = false;
6340 if (reduc_info->reused_accumulator
6341 && loop_vinfo->skip_this_loop_edge
6342 && single_succ_p (exit_bb)
6343 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6345 unify_with_main_loop_p = true;
6347 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6348 reduc_inputs[0] = make_ssa_name (vectype);
6349 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6350 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6351 UNKNOWN_LOCATION);
6352 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6353 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6354 exit_gsi = gsi_after_labels (reduc_block);
6357 /* Shouldn't be used beyond this point. */
6358 exit_bb = nullptr;
6360 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6361 && reduc_fn != IFN_LAST)
6363 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6364 various data values where the condition matched and another vector
6365 (INDUCTION_INDEX) containing all the indexes of those matches. We
6366 need to extract the last matching index (which will be the index with
6367 highest value) and use this to index into the data vector.
6368 For the case where there were no matches, the data vector will contain
6369 all default values and the index vector will be all zeros. */
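  /* Continuing the VF = 4 illustration above: with INDUCTION_INDEX
     = {0, 6, 3, 0} the IFN_REDUC_MAX below yields 6, the EQ compare
     gives the mask {0, 1, 0, 0}, the VEC_COND keeps only the data value
     of the last match, and a final unsigned MAX reduction extracts it
     as the scalar result.  */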
6371 /* Get various versions of the type of the vector of indexes. */
6372 tree index_vec_type = TREE_TYPE (induction_index);
6373 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6374 tree index_scalar_type = TREE_TYPE (index_vec_type);
6375 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6377 /* Get an unsigned integer version of the type of the data vector. */
6378 int scalar_precision
6379 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6380 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6381 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6382 vectype);
6384 /* First we need to create a vector (ZERO_VEC) of zeros and another
6385 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6386 can create using a MAX reduction and then expanding.
6387 In the case where the loop never made any matches, the max index will
6388 be zero. */
6390 /* Vector of {0, 0, 0,...}. */
6391 tree zero_vec = build_zero_cst (vectype);
6393 /* Find maximum value from the vector of found indexes. */
6394 tree max_index = make_ssa_name (index_scalar_type);
6395 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6396 1, induction_index);
6397 gimple_call_set_lhs (max_index_stmt, max_index);
6398 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6400 /* Vector of {max_index, max_index, max_index,...}. */
6401 tree max_index_vec = make_ssa_name (index_vec_type);
6402 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6403 max_index);
6404 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6405 max_index_vec_rhs);
6406 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6408 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6409 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6410 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6411 otherwise. Only one value should match, resulting in a vector
6412 (VEC_COND) with one data value and the rest zeros.
6413 In the case where the loop never made any matches, every index will
6414 match, resulting in a vector with all data values (which will all be
6415 the default value). */
6417 /* Compare the max index vector to the vector of found indexes to find
6418 the position of the max value. */
6419 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6420 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6421 induction_index,
6422 max_index_vec);
6423 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6425 /* Use the compare to choose either values from the data vector or
6426 zero. */
6427 tree vec_cond = make_ssa_name (vectype);
6428 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6429 vec_compare,
6430 reduc_inputs[0],
6431 zero_vec);
6432 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6434 /* Finally we need to extract the data value from the vector (VEC_COND)
6435 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6436 reduction, but because this doesn't exist, we can use a MAX reduction
6437 instead. The data value might be signed or a float so we need to cast
6438 it first.
6439 In the case where the loop never made any matches, the data values are
6440 all identical, and so will reduce down correctly. */
6442 /* Make the matched data values unsigned. */
6443 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6444 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6445 vec_cond);
6446 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6447 VIEW_CONVERT_EXPR,
6448 vec_cond_cast_rhs);
6449 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6451 /* Reduce down to a scalar value. */
6452 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6453 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6454 1, vec_cond_cast);
6455 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6456 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6458 /* Convert the reduced value back to the result type and set as the
6459 result. */
6460 gimple_seq stmts = NULL;
6461 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6462 data_reduc);
6463 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6464 scalar_results.safe_push (new_temp);
6466 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6467 && reduc_fn == IFN_LAST)
6469 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6470 idx = 0;
6471 idx_val = induction_index[0];
6472 val = data_reduc[0];
6473 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6474 if (induction_index[i] > idx_val)
6475 val = data_reduc[i], idx_val = induction_index[i];
6476 return val; */
6478 tree data_eltype = TREE_TYPE (vectype);
6479 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6480 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6481 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6482 /* Enforced by vectorizable_reduction, which ensures we have target
6483 support before allowing a conditional reduction on variable-length
6484 vectors. */
6485 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6486 tree idx_val = NULL_TREE, val = NULL_TREE;
6487 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6489 tree old_idx_val = idx_val;
6490 tree old_val = val;
6491 idx_val = make_ssa_name (idx_eltype);
6492 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6493 build3 (BIT_FIELD_REF, idx_eltype,
6494 induction_index,
6495 bitsize_int (el_size),
6496 bitsize_int (off)));
6497 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6498 val = make_ssa_name (data_eltype);
6499 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6500 build3 (BIT_FIELD_REF,
6501 data_eltype,
6502 reduc_inputs[0],
6503 bitsize_int (el_size),
6504 bitsize_int (off)));
6505 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6506 if (off != 0)
6508 tree new_idx_val = idx_val;
6509 if (off != v_size - el_size)
6511 new_idx_val = make_ssa_name (idx_eltype);
6512 epilog_stmt = gimple_build_assign (new_idx_val,
6513 MAX_EXPR, idx_val,
6514 old_idx_val);
6515 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6517 tree cond = make_ssa_name (boolean_type_node);
6518 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6519 idx_val, old_idx_val);
6520 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6521 tree new_val = make_ssa_name (data_eltype);
6522 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6523 cond, val, old_val);
6524 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6525 idx_val = new_idx_val;
6526 val = new_val;
6529 /* Convert the reduced value back to the result type and set as the
6530 result. */
6531 gimple_seq stmts = NULL;
6532 val = gimple_convert (&stmts, scalar_type, val);
6533 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6534 scalar_results.safe_push (val);
6537 /* 2.3 Create the reduction code, using one of the three schemes described
6538 above. In SLP we simply need to extract all the elements from the
6539 vector (without reducing them), so we use scalar shifts. */
6540 else if (reduc_fn != IFN_LAST && !slp_reduc)
6542 tree tmp;
6543 tree vec_elem_type;
6545 /* Case 1: Create:
6546 v_out2 = reduc_expr <v_out1> */
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_NOTE, vect_location,
6550 "Reduce using direct vector reduction.\n");
6552 gimple_seq stmts = NULL;
6553 vec_elem_type = TREE_TYPE (vectype);
6554 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6555 vec_elem_type, reduc_inputs[0]);
6556 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6557 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6559 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6560 && induc_val)
6562 /* Earlier we set the initial value to be a vector of induc_val
6563 values. Check the result and if it is induc_val then replace it
6564 with the original initial value, unless induc_val is
6565 the same as initial_def already. */
6566 tree zcompare = make_ssa_name (boolean_type_node);
6567 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6568 new_temp, induc_val);
6569 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6570 tree initial_def = reduc_info->reduc_initial_values[0];
6571 tmp = make_ssa_name (new_scalar_dest);
6572 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6573 initial_def, new_temp);
6574 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6575 new_temp = tmp;
6578 scalar_results.safe_push (new_temp);
6580 else if (direct_slp_reduc)
6582 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6583 with the elements for other SLP statements replaced with the
6584 neutral value. We can then do a normal reduction on each vector. */
6586 /* Enforced by vectorizable_reduction. */
6587 gcc_assert (reduc_inputs.length () == 1);
6588 gcc_assert (pow2p_hwi (group_size));
6590 gimple_seq seq = NULL;
6592 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6593 and the same element size as VECTYPE. */
6594 tree index = build_index_vector (vectype, 0, 1);
6595 tree index_type = TREE_TYPE (index);
6596 tree index_elt_type = TREE_TYPE (index_type);
6597 tree mask_type = truth_type_for (index_type);
6599 /* Create a vector that, for each element, identifies which of
6600 the REDUC_GROUP_SIZE results should use it. */
6601 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6602 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6603 build_vector_from_val (index_type, index_mask));
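  /* E.g. for GROUP_SIZE = 2 and an 8-element vector this yields
     INDEX = {0, 1, 0, 1, 0, 1, 0, 1}: result 0 reduces the even lanes
     and result 1 the odd lanes, with the other lanes replaced by the
     neutral value below.  */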
6605 /* Get a neutral vector value. This is simply a splat of the neutral
6606 scalar value if we have one, otherwise the initial scalar value
6607 is itself a neutral value. */
6608 tree vector_identity = NULL_TREE;
6609 tree neutral_op = NULL_TREE;
6610 if (slp_node)
6612 tree initial_value = NULL_TREE;
6613 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6614 initial_value = reduc_info->reduc_initial_values[0];
6615 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6616 initial_value, false);
6618 if (neutral_op)
6619 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6620 neutral_op);
6621 for (unsigned int i = 0; i < group_size; ++i)
6623 /* If there's no universal neutral value, we can use the
6624 initial scalar value from the original PHI. This is used
6625 for MIN and MAX reduction, for example. */
6626 if (!neutral_op)
6628 tree scalar_value = reduc_info->reduc_initial_values[i];
6629 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6630 scalar_value);
6631 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6632 scalar_value);
6635 /* Calculate the equivalent of:
6637 sel[j] = (index[j] == i);
6639 which selects the elements of REDUC_INPUTS[0] that should
6640 be included in the result. */
6641 tree compare_val = build_int_cst (index_elt_type, i);
6642 compare_val = build_vector_from_val (index_type, compare_val);
6643 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6644 index, compare_val);
6646 /* Calculate the equivalent of:
6648 vec = sel ? reduc_inputs[0] : vector_identity;
6650 VEC is now suitable for a full vector reduction. */
6651 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6652 sel, reduc_inputs[0], vector_identity);
6654 /* Do the reduction and convert it to the appropriate type. */
6655 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6656 TREE_TYPE (vectype), vec);
6657 scalar = gimple_convert (&seq, scalar_type, scalar);
6658 scalar_results.safe_push (scalar);
6660 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6662 else
6664 bool reduce_with_shift;
6665 tree vec_temp;
6667 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6669 /* See if the target wants to do the final (shift) reduction
6670 in a vector mode of smaller size and first reduce upper/lower
6671 halves against each other. */
6672 enum machine_mode mode1 = mode;
6673 tree stype = TREE_TYPE (vectype);
6674 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6675 unsigned nunits1 = nunits;
6676 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6677 && reduc_inputs.length () == 1)
6679 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6680 /* For SLP reductions we have to make sure lanes match up, but
6681 since we're doing individual-element final reduction, reducing
6682 the vector width here is even more important.
6683 ??? We can also separate lanes with permutes; for the common
6684 case of power-of-two group-size, odd/even extracts would work.  */
6685 if (slp_reduc && nunits != nunits1)
6687 nunits1 = least_common_multiple (nunits1, group_size);
6688 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6691 if (!slp_reduc
6692 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6693 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6695 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6696 stype, nunits1);
6697 reduce_with_shift = have_whole_vector_shift (mode1);
6698 if (!VECTOR_MODE_P (mode1)
6699 || !directly_supported_p (code, vectype1))
6700 reduce_with_shift = false;
6702 /* First reduce the vector to the desired vector size we should
6703 do shift reduction on by combining upper and lower halves. */
6704 gimple_seq stmts = NULL;
6705 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6706 code, &stmts);
6707 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6708 reduc_inputs[0] = new_temp;
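      /* As a hedged example: on targets implementing the split_reduction
	 hook (x86 with AVX is one such case) a V8SI input may first be
	 narrowed to V4SI here by combining its upper and lower halves with
	 CODE, so that the shift-based or scalar reduction below only has
	 to handle the narrower vector.  */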
6710 if (reduce_with_shift && !slp_reduc)
6712 int element_bitsize = tree_to_uhwi (bitsize);
6713 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6714 for variable-length vectors and also requires direct target support
6715 for loop reductions. */
6716 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6717 int nelements = vec_size_in_bits / element_bitsize;
6718 vec_perm_builder sel;
6719 vec_perm_indices indices;
6721 int elt_offset;
6723 tree zero_vec = build_zero_cst (vectype1);
6724 /* Case 2: Create:
6725 for (offset = nelements/2; offset >= 1; offset/=2)
6727 Create: va' = vec_shift <va, offset>
6728 Create: va = vop <va, va'>
6729 } */
6731 tree rhs;
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_NOTE, vect_location,
6735 "Reduce using vector shifts\n");
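	  /* Rough sketch of the sequence the loop below builds for
	     NELEMENTS == 4 (offsets 2 and 1):
	       va' = VEC_PERM <va, {0,...}, {2, 3, 4, 5}>;   shift down by 2
	       va  = va CODE va';
	       va' = VEC_PERM <va, {0,...}, {1, 2, 3, 4}>;   shift down by 1
	       va  = va CODE va';
	     after which element 0 of VA holds the reduced value that 2.4
	     extracts.  */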
6737 gimple_seq stmts = NULL;
6738 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6739 for (elt_offset = nelements / 2;
6740 elt_offset >= 1;
6741 elt_offset /= 2)
6743 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6744 indices.new_vector (sel, 2, nelements);
6745 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6746 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6747 new_temp, zero_vec, mask);
6748 new_temp = gimple_build (&stmts, code,
6749 vectype1, new_name, new_temp);
6751 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6753 /* 2.4 Extract the final scalar result. Create:
6754 s_out3 = extract_field <v_out2, bitpos> */
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_NOTE, vect_location,
6758 "extract scalar result\n");
6760 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6761 bitsize, bitsize_zero_node);
6762 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6763 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6764 gimple_assign_set_lhs (epilog_stmt, new_temp);
6765 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6766 scalar_results.safe_push (new_temp);
6768 else
6770 /* Case 3: Create:
6771 s = extract_field <v_out2, 0>
6772 for (offset = element_size;
6773 offset < vector_size;
6774 offset += element_size;)
6776 Create: s' = extract_field <v_out2, offset>
6777 Create: s = op <s, s'> // For non SLP cases
6778 } */
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_NOTE, vect_location,
6782 "Reduce using scalar code.\n");
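	  /* Rough sketch of the non-SLP scalar code built below for a
	     4-element vector of 32-bit ints:
	       s  = BIT_FIELD_REF <v_out2, 32, 0>;
	       s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s CODE s';
	       s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s CODE s';
	       s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s CODE s';  */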
6784 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6785 int element_bitsize = tree_to_uhwi (bitsize);
6786 tree compute_type = TREE_TYPE (vectype);
6787 gimple_seq stmts = NULL;
6788 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6790 int bit_offset;
6791 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6792 vec_temp, bitsize, bitsize_zero_node);
6794 /* In SLP we don't need to apply reduction operation, so we just
6795 collect s' values in SCALAR_RESULTS. */
6796 if (slp_reduc)
6797 scalar_results.safe_push (new_temp);
6799 for (bit_offset = element_bitsize;
6800 bit_offset < vec_size_in_bits;
6801 bit_offset += element_bitsize)
6803 tree bitpos = bitsize_int (bit_offset);
6804 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6805 compute_type, vec_temp,
6806 bitsize, bitpos);
6807 if (slp_reduc)
6809 /* In SLP we don't need to apply reduction operation, so
6810 we just collect s' values in SCALAR_RESULTS. */
6811 new_temp = new_name;
6812 scalar_results.safe_push (new_name);
6814 else
6815 new_temp = gimple_build (&stmts, code, compute_type,
6816 new_name, new_temp);
6820 /* The only case where we need to reduce scalar results in SLP is
6821 unrolling. If the size of SCALAR_RESULTS is greater than
6822 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6823 REDUC_GROUP_SIZE. */
6824 if (slp_reduc)
6826 tree res, first_res, new_res;
6828 /* Reduce multiple scalar results in case of SLP unrolling. */
6829 for (j = group_size; scalar_results.iterate (j, &res);
6830 j++)
6832 first_res = scalar_results[j % group_size];
6833 new_res = gimple_build (&stmts, code, compute_type,
6834 first_res, res);
6835 scalar_results[j % group_size] = new_res;
6837 scalar_results.truncate (group_size);
6838 for (k = 0; k < group_size; k++)
6839 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6840 scalar_results[k]);
6842 else
6844 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6845 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6846 scalar_results.safe_push (new_temp);
6849 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6852 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6853 && induc_val)
6855 /* Earlier we set the initial value to be a vector of induc_val
6856 values.  Check the result and if it is induc_val then replace
6857 with the original initial value, unless induc_val is
6858 the same as initial_def already.  */
6859 tree zcompare = make_ssa_name (boolean_type_node);
6860 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6861 induc_val);
6862 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6863 tree initial_def = reduc_info->reduc_initial_values[0];
6864 tree tmp = make_ssa_name (new_scalar_dest);
6865 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6866 initial_def, new_temp);
6867 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6868 scalar_results[0] = tmp;
6872 /* 2.5 Adjust the final result by the initial value of the reduction
6873 variable. (When such adjustment is not needed, then
6874 'adjustment_def' is zero). For example, if code is PLUS we create:
6875 new_temp = loop_exit_def + adjustment_def */
6877 if (adjustment_def)
6879 gcc_assert (!slp_reduc);
6880 gimple_seq stmts = NULL;
6881 if (double_reduc)
6883 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6884 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6885 new_temp = gimple_build (&stmts, code, vectype,
6886 reduc_inputs[0], adjustment_def);
6888 else
6890 new_temp = scalar_results[0];
6891 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6892 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6893 adjustment_def);
6894 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6895 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6896 new_temp, adjustment_def);
6897 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6900 epilog_stmt = gimple_seq_last_stmt (stmts);
6901 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6902 scalar_results[0] = new_temp;
6905 /* Record this operation if it could be reused by the epilogue loop. */
6906 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6907 && reduc_inputs.length () == 1)
6908 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6909 { orig_reduc_input, reduc_info });
6911 if (double_reduc)
6912 loop = outer_loop;
6914 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6915 phis with new adjusted scalar results, i.e., replace use <s_out0>
6916 with use <s_out4>.
6918 Transform:
6919 loop_exit:
6920 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6921 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6922 v_out2 = reduce <v_out1>
6923 s_out3 = extract_field <v_out2, 0>
6924 s_out4 = adjust_result <s_out3>
6925 use <s_out0>
6926 use <s_out0>
6928 into:
6930 loop_exit:
6931 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6932 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6933 v_out2 = reduce <v_out1>
6934 s_out3 = extract_field <v_out2, 0>
6935 s_out4 = adjust_result <s_out3>
6936 use <s_out4>
6937 use <s_out4> */
6939 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6940 for (k = 0; k < live_out_stmts.size (); k++)
6942 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6943 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6945 phis.create (3);
6946 /* Find the loop-closed-use at the loop exit of the original scalar
6947 result. (The reduction result is expected to have two immediate uses,
6948 one at the latch block, and one at the loop exit). For double
6949 reductions we are looking for exit phis of the outer loop. */
6950 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6952 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6954 if (!is_gimple_debug (USE_STMT (use_p)))
6955 phis.safe_push (USE_STMT (use_p));
6957 else
6959 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6961 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6963 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6965 if (!flow_bb_inside_loop_p (loop,
6966 gimple_bb (USE_STMT (phi_use_p)))
6967 && !is_gimple_debug (USE_STMT (phi_use_p)))
6968 phis.safe_push (USE_STMT (phi_use_p));
6974 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6976 /* Replace the uses: */
6977 orig_name = PHI_RESULT (exit_phi);
6979 /* Look for a single use at the target of the skip edge. */
6980 if (unify_with_main_loop_p)
6982 use_operand_p use_p;
6983 gimple *user;
6984 if (!single_imm_use (orig_name, &use_p, &user))
6985 gcc_unreachable ();
6986 orig_name = gimple_get_lhs (user);
6989 scalar_result = scalar_results[k];
6990 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6992 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6993 SET_USE (use_p, scalar_result);
6994 update_stmt (use_stmt);
6998 phis.release ();
7002 /* Return a vector of type VECTYPE that is equal to the vector select
7003 operation "MASK ? VEC : IDENTITY". Insert the select statements
7004 before GSI. */
7006 static tree
7007 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7008 tree vec, tree identity)
7010 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7011 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7012 mask, vec, identity);
7013 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7014 return cond;
7017 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7018 order, starting with LHS. Insert the extraction statements before GSI and
7019 associate the new scalar SSA names with variable SCALAR_DEST.
7020 If MASK is nonzero, mask the input and then operate on it unconditionally.
7021 Return the SSA name for the result. */
7023 static tree
7024 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7025 tree_code code, tree lhs, tree vector_rhs,
7026 tree mask)
7028 tree vectype = TREE_TYPE (vector_rhs);
7029 tree scalar_type = TREE_TYPE (vectype);
7030 tree bitsize = TYPE_SIZE (scalar_type);
7031 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7032 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7034 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7035 to perform an unconditional element-wise reduction of it. */
7036 if (mask)
7038 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7039 "masked_vector_rhs");
7040 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7041 false);
7042 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7043 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7044 mask, vector_rhs, vector_identity);
7045 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7046 vector_rhs = masked_vector_rhs;
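  /* Hedged sketch of the expansion performed by the loop below for a
     4-element vector (any masking has already been folded into VECTOR_RHS
     above):
       s0 = BIT_FIELD_REF <vector_rhs, eltsize, 0>;        t0 = lhs CODE s0;
       s1 = BIT_FIELD_REF <vector_rhs, eltsize, eltsize>;  t1 = t0 CODE s1;
       ...
     and the final accumulator is returned, preserving strict left-to-right
     order.  */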
7049 for (unsigned HOST_WIDE_INT bit_offset = 0;
7050 bit_offset < vec_size_in_bits;
7051 bit_offset += element_bitsize)
7053 tree bitpos = bitsize_int (bit_offset);
7054 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7055 bitsize, bitpos);
7057 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7058 rhs = make_ssa_name (scalar_dest, stmt);
7059 gimple_assign_set_lhs (stmt, rhs);
7060 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7062 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7063 tree new_name = make_ssa_name (scalar_dest, stmt);
7064 gimple_assign_set_lhs (stmt, new_name);
7065 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7066 lhs = new_name;
7068 return lhs;
7071 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7072 type of the vector input. */
7074 static internal_fn
7075 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7077 internal_fn mask_reduc_fn;
7078 internal_fn mask_len_reduc_fn;
7080 switch (reduc_fn)
7082 case IFN_FOLD_LEFT_PLUS:
7083 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7084 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7085 break;
7087 default:
7088 return IFN_LAST;
7091 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7092 OPTIMIZE_FOR_SPEED))
7093 return mask_reduc_fn;
7094 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7095 OPTIMIZE_FOR_SPEED))
7096 return mask_len_reduc_fn;
7097 return IFN_LAST;
7100 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7101 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7102 statement. CODE is the operation performed by STMT_INFO and OPS are
7103 its scalar operands. REDUC_INDEX is the index of the operand in
7104 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7105 implements in-order reduction, or IFN_LAST if we should open-code it.
7106 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7107 that should be used to control the operation in a fully-masked loop. */
7109 static bool
7110 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7111 stmt_vec_info stmt_info,
7112 gimple_stmt_iterator *gsi,
7113 gimple **vec_stmt, slp_tree slp_node,
7114 gimple *reduc_def_stmt,
7115 code_helper code, internal_fn reduc_fn,
7116 tree *ops, int num_ops, tree vectype_in,
7117 int reduc_index, vec_loop_masks *masks,
7118 vec_loop_lens *lens)
7120 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7121 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7122 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7124 int ncopies;
7125 if (slp_node)
7126 ncopies = 1;
7127 else
7128 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7130 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7131 gcc_assert (ncopies == 1);
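  /* Illustrative sketch (names are examples only) of what this function
     ultimately emits per vector definition when the target provides a
     fold-left internal function:
       res_1 = .FOLD_LEFT_PLUS (res_0, vect_def);
     or, in a fully-masked loop,
       res_1 = .MASK_FOLD_LEFT_PLUS (res_0, vect_def, loop_mask);
     otherwise the reduction is open-coded via vect_expand_fold_left.  */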
7133 bool is_cond_op = false;
7134 if (!code.is_tree_code ())
7136 code = conditional_internal_fn_code (internal_fn (code));
7137 gcc_assert (code != ERROR_MARK);
7138 is_cond_op = true;
7141 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7143 if (slp_node)
7145 if (is_cond_op)
7147 if (dump_enabled_p ())
7148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7149 "fold-left reduction on SLP not supported.\n");
7150 return false;
7153 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7154 TYPE_VECTOR_SUBPARTS (vectype_in)));
7157 /* The operands either come from a binary operation or an IFN_COND operation.
7158 The former is a gimple assign with binary rhs and the latter is a
7159 gimple call with four arguments. */
7160 gcc_assert (num_ops == 2 || num_ops == 4);
7161 tree op0, opmask;
7162 if (!is_cond_op)
7163 op0 = ops[1 - reduc_index];
7164 else
7166 op0 = ops[2 + (1 - reduc_index)];
7167 opmask = ops[0];
7168 gcc_assert (!slp_node);
7171 int group_size = 1;
7172 stmt_vec_info scalar_dest_def_info;
7173 auto_vec<tree> vec_oprnds0, vec_opmask;
7174 if (slp_node)
7176 auto_vec<vec<tree> > vec_defs (2);
7177 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7178 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7179 vec_defs[0].release ();
7180 vec_defs[1].release ();
7181 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7182 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7184 else
7186 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7187 op0, &vec_oprnds0);
7188 scalar_dest_def_info = stmt_info;
7190 /* For an IFN_COND_OP we also need the vector mask operand. */
7191 if (is_cond_op)
7192 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7193 opmask, &vec_opmask);
7196 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7197 tree scalar_dest = gimple_get_lhs (sdef);
7198 tree scalar_type = TREE_TYPE (scalar_dest);
7199 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7201 int vec_num = vec_oprnds0.length ();
7202 gcc_assert (vec_num == 1 || slp_node);
7203 tree vec_elem_type = TREE_TYPE (vectype_out);
7204 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7206 tree vector_identity = NULL_TREE;
7207 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7209 vector_identity = build_zero_cst (vectype_out);
7210 if (!HONOR_SIGNED_ZEROS (vectype_out))
7212 else
7214 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7215 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7216 vector_identity);
7220 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7221 int i;
7222 tree def0;
7223 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7225 gimple *new_stmt;
7226 tree mask = NULL_TREE;
7227 tree len = NULL_TREE;
7228 tree bias = NULL_TREE;
7229 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7230 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7231 else if (is_cond_op)
7232 mask = vec_opmask[0];
7233 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7235 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7236 i, 1);
7237 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7238 bias = build_int_cst (intQI_type_node, biasval);
7239 if (!is_cond_op)
7240 mask = build_minus_one_cst (truth_type_for (vectype_in));
7243 /* Handle MINUS by adding the negative. */
7244 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7246 tree negated = make_ssa_name (vectype_out);
7247 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7248 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7249 def0 = negated;
7252 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7253 && mask && mask_reduc_fn == IFN_LAST)
7254 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7255 vector_identity);
7257 /* On the first iteration the input is simply the scalar phi
7258 result, and for subsequent iterations it is the output of
7259 the preceding operation. */
7260 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7262 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7263 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7264 def0, mask, len, bias);
7265 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7266 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7267 def0, mask);
7268 else
7269 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7270 def0);
7271 /* For chained SLP reductions the output of the previous reduction
7272 operation serves as the input of the next. For the final statement
7273 the output cannot be a temporary - we reuse the original
7274 scalar destination of the last statement. */
7275 if (i != vec_num - 1)
7277 gimple_set_lhs (new_stmt, scalar_dest_var);
7278 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7279 gimple_set_lhs (new_stmt, reduc_var);
7282 else
7284 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7285 tree_code (code), reduc_var, def0,
7286 mask);
7287 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7288 /* Remove the statement, so that we can use the same code paths
7289 as for statements that we've just created. */
7290 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7291 gsi_remove (&tmp_gsi, true);
7294 if (i == vec_num - 1)
7296 gimple_set_lhs (new_stmt, scalar_dest);
7297 vect_finish_replace_stmt (loop_vinfo,
7298 scalar_dest_def_info,
7299 new_stmt);
7301 else
7302 vect_finish_stmt_generation (loop_vinfo,
7303 scalar_dest_def_info,
7304 new_stmt, gsi);
7306 if (slp_node)
7307 slp_node->push_vec_def (new_stmt);
7308 else
7310 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7311 *vec_stmt = new_stmt;
7315 return true;
7318 /* Function is_nonwrapping_integer_induction.
7320 Check if STMT_VINFO (which is part of loop LOOP) describes an induction
7321 that both increments and does not cause overflow.  */
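/* Illustrative example (not from the original sources): for a PHI with
   evolution {0, +, 4} in a loop that executes at most N times, this
   requires 0 + 4 * N to be representable in the precision of the PHI
   result type.  */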
7323 static bool
7324 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7326 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7327 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7328 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7329 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7330 widest_int ni, max_loop_value, lhs_max;
7331 wi::overflow_type overflow = wi::OVF_NONE;
7333 /* Make sure the loop is integer based. */
7334 if (TREE_CODE (base) != INTEGER_CST
7335 || TREE_CODE (step) != INTEGER_CST)
7336 return false;
7338 /* Check that the max size of the loop will not wrap. */
7340 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7341 return true;
7343 if (! max_stmt_executions (loop, &ni))
7344 return false;
7346 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7347 &overflow);
7348 if (overflow)
7349 return false;
7351 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7352 TYPE_SIGN (lhs_type), &overflow);
7353 if (overflow)
7354 return false;
7356 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7357 <= TYPE_PRECISION (lhs_type));
7360 /* Check if masking can be supported by inserting a conditional expression.
7361 CODE is the code for the operation. COND_FN is the conditional internal
7362 function, if it exists. VECTYPE_IN is the type of the vector input. */
7363 static bool
7364 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7365 tree vectype_in)
7367 if (cond_fn != IFN_LAST
7368 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7369 OPTIMIZE_FOR_SPEED))
7370 return false;
7372 if (code.is_tree_code ())
7373 switch (tree_code (code))
7375 case DOT_PROD_EXPR:
7376 case SAD_EXPR:
7377 return true;
7379 default:
7380 break;
7382 return false;
7385 /* Insert a conditional expression to enable masked vectorization. CODE is the
7386 code for the operation. VOP is the array of operands. MASK is the loop
7387 mask. GSI is a statement iterator used to place the new conditional
7388 expression. */
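/* Illustrative summary of the rewrite below, assuming MASK is the loop mask:
   for DOT_PROD_EXPR operand 1 becomes MASK ? vop[1] : 0, so inactive lanes
   contribute nothing to the dot product; for SAD_EXPR it becomes
   MASK ? vop[1] : vop[0], so inactive lanes yield a zero absolute
   difference.  */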
7389 static void
7390 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7391 gimple_stmt_iterator *gsi)
7393 switch (tree_code (code))
7395 case DOT_PROD_EXPR:
7397 tree vectype = TREE_TYPE (vop[1]);
7398 tree zero = build_zero_cst (vectype);
7399 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7400 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7401 mask, vop[1], zero);
7402 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7403 vop[1] = masked_op1;
7404 break;
7407 case SAD_EXPR:
7409 tree vectype = TREE_TYPE (vop[1]);
7410 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7411 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7412 mask, vop[1], vop[0]);
7413 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7414 vop[1] = masked_op1;
7415 break;
7418 default:
7419 gcc_unreachable ();
7423 /* Function vectorizable_reduction.
7425 Check if STMT_INFO performs a reduction operation that can be vectorized.
7426 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7427 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7428 Return true if STMT_INFO is vectorizable in this way.
7430 This function also handles reduction idioms (patterns) that have been
7431 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7432 may be of this form:
7433 X = pattern_expr (arg0, arg1, ..., X)
7434 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7435 sequence that had been detected and replaced by the pattern-stmt
7436 (STMT_INFO).
7438 This function also handles reduction of condition expressions, for example:
7439 for (int i = 0; i < N; i++)
7440 if (a[i] < value)
7441 last = a[i];
7442 This is handled by vectorising the loop and creating an additional vector
7443 containing the loop indexes for which "a[i] < value" was true. In the
7444 function epilogue this is reduced to a single max value and then used to
7445 index into the vector of results.
7447 In some cases of reduction patterns, the type of the reduction variable X is
7448 different than the type of the other arguments of STMT_INFO.
7449 In such cases, the vectype that is used when transforming STMT_INFO into
7450 a vector stmt is different than the vectype that is used to determine the
7451 vectorization factor, because it consists of a different number of elements
7452 than the actual number of elements that are being operated upon in parallel.
7454 For example, consider an accumulation of shorts into an int accumulator.
7455 On some targets it's possible to vectorize this pattern operating on 8
7456 shorts at a time (hence, the vectype for purposes of determining the
7457 vectorization factor should be V8HI); on the other hand, the vectype that
7458 is used to create the vector form is actually V4SI (the type of the result).
7460 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7461 indicates what is the actual level of parallelism (V8HI in the example), so
7462 that the right vectorization factor would be derived. This vectype
7463 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7464 be used to create the vectorized stmt. The right vectype for the vectorized
7465 stmt is obtained from the type of the result X:
7466 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7468 This means that, contrary to "regular" reductions (or "regular" stmts in
7469 general), the following equation:
7470 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7471 does *NOT* necessarily hold for reduction patterns. */
7473 bool
7474 vectorizable_reduction (loop_vec_info loop_vinfo,
7475 stmt_vec_info stmt_info, slp_tree slp_node,
7476 slp_instance slp_node_instance,
7477 stmt_vector_for_cost *cost_vec)
7479 tree vectype_in = NULL_TREE;
7480 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7481 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7482 stmt_vec_info cond_stmt_vinfo = NULL;
7483 int i;
7484 int ncopies;
7485 bool single_defuse_cycle = false;
7486 bool nested_cycle = false;
7487 bool double_reduc = false;
7488 int vec_num;
7489 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7490 tree cond_reduc_val = NULL_TREE;
7492 /* Make sure it was already recognized as a reduction computation. */
7493 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7494 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7495 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7496 return false;
7498 /* The stmt we store reduction analysis meta on. */
7499 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7500 reduc_info->is_reduc_info = true;
7502 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7504 if (is_a <gphi *> (stmt_info->stmt))
7506 if (slp_node)
7508 /* We eventually need to set a vector type on invariant
7509 arguments. */
7510 unsigned j;
7511 slp_tree child;
7512 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7513 if (!vect_maybe_update_slp_op_vectype
7514 (child, SLP_TREE_VECTYPE (slp_node)))
7516 if (dump_enabled_p ())
7517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7518 "incompatible vector types for "
7519 "invariants\n");
7520 return false;
7523 /* Analysis for double-reduction is done on the outer
7524 loop PHI, nested cycles have no further restrictions. */
7525 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7527 else
7528 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7529 return true;
7532 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7533 stmt_vec_info phi_info = stmt_info;
7534 if (!is_a <gphi *> (stmt_info->stmt))
7536 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7537 return true;
7539 if (slp_node)
7541 slp_node_instance->reduc_phis = slp_node;
7542 /* ??? We're leaving slp_node to point to the PHIs, we only
7543 need it to get at the number of vector stmts which wasn't
7544 yet initialized for the instance root. */
7546 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7548 use_operand_p use_p;
7549 gimple *use_stmt;
7550 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7551 &use_p, &use_stmt);
7552 gcc_assert (res);
7553 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7556 /* PHIs should not participate in patterns. */
7557 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7558 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7560 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7561 and compute the reduction chain length. Discover the real
7562 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7563 tree reduc_def
7564 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7565 loop_latch_edge
7566 (gimple_bb (reduc_def_phi)->loop_father));
7567 unsigned reduc_chain_length = 0;
7568 bool only_slp_reduc_chain = true;
7569 stmt_info = NULL;
7570 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7571 while (reduc_def != PHI_RESULT (reduc_def_phi))
7573 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7574 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7575 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7577 if (dump_enabled_p ())
7578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7579 "reduction chain broken by patterns.\n");
7580 return false;
7582 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7583 only_slp_reduc_chain = false;
7584 /* For epilogue generation live members of the chain need
7585 to point back to the PHI via their original stmt for
7586 info_for_reduction to work. For SLP we need to look at
7587 all lanes here - even though we only will vectorize from
7588 the SLP node with live lane zero the other live lanes also
7589 need to be identified as part of a reduction to be able
7590 to skip code generation for them. */
7591 if (slp_for_stmt_info)
7593 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7594 if (STMT_VINFO_LIVE_P (s))
7595 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7597 else if (STMT_VINFO_LIVE_P (vdef))
7598 STMT_VINFO_REDUC_DEF (def) = phi_info;
7599 gimple_match_op op;
7600 if (!gimple_extract_op (vdef->stmt, &op))
7602 if (dump_enabled_p ())
7603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 "reduction chain includes unsupported"
7605 " statement type.\n");
7606 return false;
7608 if (CONVERT_EXPR_CODE_P (op.code))
7610 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7612 if (dump_enabled_p ())
7613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7614 "conversion in the reduction chain.\n");
7615 return false;
7618 else if (!stmt_info)
7619 /* First non-conversion stmt. */
7620 stmt_info = vdef;
7621 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7622 reduc_chain_length++;
7623 if (!stmt_info && slp_node)
7624 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7626 /* PHIs should not participate in patterns. */
7627 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7629 if (nested_in_vect_loop_p (loop, stmt_info))
7631 loop = loop->inner;
7632 nested_cycle = true;
7635 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7636 element. */
7637 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7639 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7640 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7642 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7643 gcc_assert (slp_node
7644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7646 /* 1. Is vectorizable reduction? */
7647 /* Not supportable if the reduction variable is used in the loop, unless
7648 it's a reduction chain. */
7649 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7650 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7651 return false;
7653 /* Reductions that are not used even in an enclosing outer-loop,
7654 are expected to be "live" (used out of the loop). */
7655 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7656 && !STMT_VINFO_LIVE_P (stmt_info))
7657 return false;
7659 /* 2. Has this been recognized as a reduction pattern?
7661 Check if STMT represents a pattern that has been recognized
7662 in earlier analysis stages. For stmts that represent a pattern,
7663 the STMT_VINFO_RELATED_STMT field records the last stmt in
7664 the original sequence that constitutes the pattern. */
7666 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7667 if (orig_stmt_info)
7669 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7670 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7673 /* 3. Check the operands of the operation. The first operands are defined
7674 inside the loop body. The last operand is the reduction variable,
7675 which is defined by the loop-header-phi. */
7677 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7678 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7679 gimple_match_op op;
7680 if (!gimple_extract_op (stmt_info->stmt, &op))
7681 gcc_unreachable ();
7682 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7683 || op.code == WIDEN_SUM_EXPR
7684 || op.code == SAD_EXPR);
7686 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7687 && !SCALAR_FLOAT_TYPE_P (op.type))
7688 return false;
7690 /* Do not try to vectorize bit-precision reductions. */
7691 if (!type_has_mode_precision_p (op.type))
7692 return false;
7694 /* For lane-reducing ops we're reducing the number of reduction PHIs
7695 which means the only use of that may be in the lane-reducing operation. */
7696 if (lane_reduc_code_p
7697 && reduc_chain_length != 1
7698 && !only_slp_reduc_chain)
7700 if (dump_enabled_p ())
7701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7702 "lane-reducing reduction with extra stmts.\n");
7703 return false;
7706 /* All uses but the last are expected to be defined in the loop.
7707 The last use is the reduction variable. In case of nested cycle this
7708 assumption is not true: we use reduc_index to record the index of the
7709 reduction variable. */
7710 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7711 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7712 /* We need to skip an extra operand for COND_EXPRs with embedded
7713 comparison. */
7714 unsigned opno_adjust = 0;
7715 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7716 opno_adjust = 1;
7717 for (i = 0; i < (int) op.num_ops; i++)
7719 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7720 if (i == 0 && op.code == COND_EXPR)
7721 continue;
7723 stmt_vec_info def_stmt_info;
7724 enum vect_def_type dt;
7725 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7726 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7727 &vectype_op[i], &def_stmt_info))
7729 if (dump_enabled_p ())
7730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7731 "use not simple.\n");
7732 return false;
7734 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7735 continue;
7737 /* For an IFN_COND_OP we might hit the reduction definition operand
7738 twice (once as definition, once as else). */
7739 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7740 continue;
7742 /* There should be only one cycle def in the stmt, the one
7743 leading to reduc_def. */
7744 if (VECTORIZABLE_CYCLE_DEF (dt))
7745 return false;
7747 if (!vectype_op[i])
7748 vectype_op[i]
7749 = get_vectype_for_scalar_type (loop_vinfo,
7750 TREE_TYPE (op.ops[i]), slp_op[i]);
7752 /* To properly compute ncopies we are interested in the widest
7753 non-reduction input type in case we're looking at a widening
7754 accumulation that we later handle in vect_transform_reduction. */
7755 if (lane_reduc_code_p
7756 && vectype_op[i]
7757 && (!vectype_in
7758 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7759 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7760 vectype_in = vectype_op[i];
7762 if (op.code == COND_EXPR)
7764 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7765 if (dt == vect_constant_def)
7767 cond_reduc_dt = dt;
7768 cond_reduc_val = op.ops[i];
7770 if (dt == vect_induction_def
7771 && def_stmt_info
7772 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7774 cond_reduc_dt = dt;
7775 cond_stmt_vinfo = def_stmt_info;
7779 if (!vectype_in)
7780 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7781 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7783 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7784 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7785 /* If we have a condition reduction, see if we can simplify it further. */
7786 if (v_reduc_type == COND_REDUCTION)
7788 if (slp_node)
7789 return false;
7791 /* When the condition uses the reduction value in the condition, fail. */
7792 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7794 if (dump_enabled_p ())
7795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7796 "condition depends on previous iteration\n");
7797 return false;
7800 if (reduc_chain_length == 1
7801 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7802 OPTIMIZE_FOR_SPEED)
7803 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7804 vectype_in,
7805 OPTIMIZE_FOR_SPEED)))
7807 if (dump_enabled_p ())
7808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7809 "optimizing condition reduction with"
7810 " FOLD_EXTRACT_LAST.\n");
7811 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7813 else if (cond_reduc_dt == vect_induction_def)
7815 tree base
7816 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7817 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7819 gcc_assert (TREE_CODE (base) == INTEGER_CST
7820 && TREE_CODE (step) == INTEGER_CST);
7821 cond_reduc_val = NULL_TREE;
7822 enum tree_code cond_reduc_op_code = ERROR_MARK;
7823 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7824 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7826 /* Find a suitable value: below base for MAX_EXPR, above base for
7827 MIN_EXPR; punt for now if base is the minimum value of the type
7828 for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
7829 else if (tree_int_cst_sgn (step) == -1)
7831 cond_reduc_op_code = MIN_EXPR;
7832 if (tree_int_cst_sgn (base) == -1)
7833 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7834 else if (tree_int_cst_lt (base,
7835 TYPE_MAX_VALUE (TREE_TYPE (base))))
7836 cond_reduc_val
7837 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7839 else
7841 cond_reduc_op_code = MAX_EXPR;
7842 if (tree_int_cst_sgn (base) == 1)
7843 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7844 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7845 base))
7846 cond_reduc_val
7847 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7849 if (cond_reduc_val)
7851 if (dump_enabled_p ())
7852 dump_printf_loc (MSG_NOTE, vect_location,
7853 "condition expression based on "
7854 "integer induction.\n");
7855 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7856 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7857 = cond_reduc_val;
7858 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7861 else if (cond_reduc_dt == vect_constant_def)
7863 enum vect_def_type cond_initial_dt;
7864 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7865 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7866 if (cond_initial_dt == vect_constant_def
7867 && types_compatible_p (TREE_TYPE (cond_initial_val),
7868 TREE_TYPE (cond_reduc_val)))
7870 tree e = fold_binary (LE_EXPR, boolean_type_node,
7871 cond_initial_val, cond_reduc_val);
7872 if (e && (integer_onep (e) || integer_zerop (e)))
7874 if (dump_enabled_p ())
7875 dump_printf_loc (MSG_NOTE, vect_location,
7876 "condition expression based on "
7877 "compile time constant.\n");
7878 /* Record reduction code at analysis stage. */
7879 STMT_VINFO_REDUC_CODE (reduc_info)
7880 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7881 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7887 if (STMT_VINFO_LIVE_P (phi_info))
7888 return false;
7890 if (slp_node)
7891 ncopies = 1;
7892 else
7893 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7895 gcc_assert (ncopies >= 1);
7897 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7899 if (nested_cycle)
7901 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7902 == vect_double_reduction_def);
7903 double_reduc = true;
7906 /* 4.2. Check support for the epilog operation.
7908 If STMT represents a reduction pattern, then the type of the
7909 reduction variable may be different than the type of the rest
7910 of the arguments. For example, consider the case of accumulation
7911 of shorts into an int accumulator; The original code:
7912 S1: int_a = (int) short_a;
7913 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7915 was replaced with:
7916 STMT: int_acc = widen_sum <short_a, int_acc>
7918 This means that:
7919 1. The tree-code that is used to create the vector operation in the
7920 epilog code (that reduces the partial results) is not the
7921 tree-code of STMT, but is rather the tree-code of the original
7922 stmt from the pattern that STMT is replacing. I.e, in the example
7923 above we want to use 'widen_sum' in the loop, but 'plus' in the
7924 epilog.
7925 2. The type (mode) we use to check available target support
7926 for the vector operation to be created in the *epilog*, is
7927 determined by the type of the reduction variable (in the example
7928 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7929 However the type (mode) we use to check available target support
7930 for the vector operation to be created *inside the loop*, is
7931 determined by the type of the other arguments to STMT (in the
7932 example we'd check this: optab_handler (widen_sum_optab,
7933 vect_short_mode)).
7935 This is contrary to "regular" reductions, in which the types of all
7936 the arguments are the same as the type of the reduction variable.
7937 For "regular" reductions we can therefore use the same vector type
7938 (and also the same tree-code) when generating the epilog code and
7939 when generating the code inside the loop. */
7941 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7943 /* Conversion might already have created a conditional operation like
7944 IFN_COND_ADD.  Use the corresponding tree code for the following checks.  */
7945 if (orig_code.is_internal_fn ())
7947 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7948 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7951 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7953 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7954 if (reduction_type == TREE_CODE_REDUCTION)
7956 /* Check whether it's ok to change the order of the computation.
7957 Generally, when vectorizing a reduction we change the order of the
7958 computation. This may change the behavior of the program in some
7959 cases, so we need to check that this is ok. One exception is when
7960 vectorizing an outer-loop: the inner-loop is executed sequentially,
7961 and therefore vectorizing reductions in the inner-loop during
7962 outer-loop vectorization is safe. Likewise when we are vectorizing
7963 a series of reductions using SLP and the VF is one, the reductions
7964 are performed in scalar order. */
7965 if (slp_node
7966 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7967 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7969 else if (needs_fold_left_reduction_p (op.type, orig_code))
7971 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7972 is not directly used in stmt.  */
7973 if (!only_slp_reduc_chain
7974 && reduc_chain_length != 1)
7976 if (dump_enabled_p ())
7977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 "in-order reduction chain without SLP.\n");
7979 return false;
7981 STMT_VINFO_REDUC_TYPE (reduc_info)
7982 = reduction_type = FOLD_LEFT_REDUCTION;
7984 else if (!commutative_binary_op_p (orig_code, op.type)
7985 || !associative_binary_op_p (orig_code, op.type))
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989 "reduction: not commutative/associative\n");
7990 return false;
7994 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7995 && ncopies > 1)
7997 if (dump_enabled_p ())
7998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999 "multiple types in double reduction or condition "
8000 "reduction or fold-left reduction.\n");
8001 return false;
8004 internal_fn reduc_fn = IFN_LAST;
8005 if (reduction_type == TREE_CODE_REDUCTION
8006 || reduction_type == FOLD_LEFT_REDUCTION
8007 || reduction_type == INTEGER_INDUC_COND_REDUCTION
8008 || reduction_type == CONST_COND_REDUCTION)
8010 if (reduction_type == FOLD_LEFT_REDUCTION
8011 ? fold_left_reduction_fn (orig_code, &reduc_fn)
8012 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8014 if (reduc_fn != IFN_LAST
8015 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8016 OPTIMIZE_FOR_SPEED))
8018 if (dump_enabled_p ())
8019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8020 "reduc op not supported by target.\n");
8022 reduc_fn = IFN_LAST;
8025 else
8027 if (!nested_cycle || double_reduc)
8029 if (dump_enabled_p ())
8030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8031 "no reduc code for scalar code.\n");
8033 return false;
8037 else if (reduction_type == COND_REDUCTION)
8039 int scalar_precision
8040 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8041 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8042 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8043 vectype_out);
8045 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8046 OPTIMIZE_FOR_SPEED))
8047 reduc_fn = IFN_REDUC_MAX;
8049 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8051 if (reduction_type != EXTRACT_LAST_REDUCTION
8052 && (!nested_cycle || double_reduc)
8053 && reduc_fn == IFN_LAST
8054 && !nunits_out.is_constant ())
8056 if (dump_enabled_p ())
8057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8058 "missing target support for reduction on"
8059 " variable-length vectors.\n");
8060 return false;
8063 /* For SLP reductions, see if there is a neutral value we can use. */
8064 tree neutral_op = NULL_TREE;
8065 if (slp_node)
8067 tree initial_value = NULL_TREE;
8068 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8069 initial_value = vect_phi_initial_value (reduc_def_phi);
8070 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8071 orig_code, initial_value);
8074 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8076 /* We can't support in-order reductions of code such as this:
8078 for (int i = 0; i < n1; ++i)
8079 for (int j = 0; j < n2; ++j)
8080 l += a[j];
8082 since GCC effectively transforms the loop when vectorizing:
8084 for (int i = 0; i < n1 / VF; ++i)
8085 for (int j = 0; j < n2; ++j)
8086 for (int k = 0; k < VF; ++k)
8087 l += a[j];
8089 which is a reassociation of the original operation. */
8090 if (dump_enabled_p ())
8091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8092 "in-order double reduction not supported.\n");
8094 return false;
8097 if (reduction_type == FOLD_LEFT_REDUCTION
8098 && slp_node
8099 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8101 /* We cannot use in-order reductions in this case because there is
8102 an implicit reassociation of the operations involved. */
8103 if (dump_enabled_p ())
8104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8105 "in-order unchained SLP reductions not supported.\n");
8106 return false;
8109 /* For double reductions, and for SLP reductions with a neutral value,
8110 we construct a variable-length initial vector by loading a vector
8111 full of the neutral value and then shift-and-inserting the start
8112 values into the low-numbered elements. */
8113 if ((double_reduc || neutral_op)
8114 && !nunits_out.is_constant ()
8115 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8116 vectype_out, OPTIMIZE_FOR_SPEED))
8118 if (dump_enabled_p ())
8119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8120 "reduction on variable-length vectors requires"
8121 " target support for a vector-shift-and-insert"
8122 " operation.\n");
8123 return false;
8126 /* Check extra constraints for variable-length unchained SLP reductions. */
8127 if (slp_node
8128 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8129 && !nunits_out.is_constant ())
8131 /* We checked above that we could build the initial vector when
8132 there's a neutral element value. Check here for the case in
8133 which each SLP statement has its own initial value and in which
8134 that value needs to be repeated for every instance of the
8135 statement within the initial vector. */
8136 unsigned int group_size = SLP_TREE_LANES (slp_node);
8137 if (!neutral_op
8138 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8139 TREE_TYPE (vectype_out)))
8141 if (dump_enabled_p ())
8142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8143 "unsupported form of SLP reduction for"
8144 " variable-length vectors: cannot build"
8145 " initial vector.\n");
8146 return false;
8148 /* The epilogue code relies on the number of elements being a multiple
8149 of the group size. The duplicate-and-interleave approach to setting
8150 up the initial vector does too. */
8151 if (!multiple_p (nunits_out, group_size))
8153 if (dump_enabled_p ())
8154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8155 "unsupported form of SLP reduction for"
8156 " variable-length vectors: the vector size"
8157 " is not a multiple of the number of results.\n");
8158 return false;
8162 if (reduction_type == COND_REDUCTION)
8164 widest_int ni;
8166 if (! max_loop_iterations (loop, &ni))
8168 if (dump_enabled_p ())
8169 dump_printf_loc (MSG_NOTE, vect_location,
8170 "loop count not known, cannot create cond "
8171 "reduction.\n");
8172 return false;
8174 /* Convert backedges to iterations. */
8175 ni += 1;
8177 /* The additional index will be the same type as the condition. Check
8178 that the loop can fit into this less one (because we'll use up the
8179 zero slot for when there are no matches). */
8180 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
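      /* For example, with a 16-bit index type the loop may execute at most
	 65534 times: index 0 is reserved for "no match" and the iteration
	 count must stay strictly below the maximum index value 65535.  */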
8181 if (wi::geu_p (ni, wi::to_widest (max_index)))
8183 if (dump_enabled_p ())
8184 dump_printf_loc (MSG_NOTE, vect_location,
8185 "loop size is greater than data size.\n");
8186 return false;
8190 /* In case the vectorization factor (VF) is bigger than the number
8191 of elements that we can fit in a vectype (nunits), we have to generate
8192 more than one vector stmt - i.e - we need to "unroll" the
8193 vector stmt by a factor VF/nunits. For more details see documentation
8194 in vectorizable_operation. */
8196 /* If the reduction is used in an outer loop we need to generate
8197 VF intermediate results, like so (e.g. for ncopies=2):
8198 r0 = phi (init, r0)
8199 r1 = phi (init, r1)
8200 r0 = x0 + r0;
8201 r1 = x1 + r1;
8202 (i.e. we generate VF results in 2 registers).
8203 In this case we have a separate def-use cycle for each copy, and therefore
8204 for each copy we get the vector def for the reduction variable from the
8205 respective phi node created for this copy.
8207 Otherwise (the reduction is unused in the loop nest), we can combine
8208 together intermediate results, like so (e.g. for ncopies=2):
8209 r = phi (init, r)
8210 r = x0 + r;
8211 r = x1 + r;
8212 (i.e. we generate VF/2 results in a single register).
8213 In this case for each copy we get the vector def for the reduction variable
8214 from the vectorized reduction operation generated in the previous iteration.
8216 This only works when we see both the reduction PHI and its only consumer
8217 in vectorizable_reduction and there are no intermediate stmts
8218 participating. When unrolling we want each unrolled iteration to have its
8219 own reduction accumulator since one of the main goals of unrolling a
8220 reduction is to reduce the aggregate loop-carried latency. */
8221 if (ncopies > 1
8222 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8223 && reduc_chain_length == 1
8224 && loop_vinfo->suggested_unroll_factor == 1)
8225 single_defuse_cycle = true;
8227 if (single_defuse_cycle || lane_reduc_code_p)
8229 gcc_assert (op.code != COND_EXPR);
8231 /* 4. Supportable by target? */
8232 bool ok = true;
8234 /* 4.1. check support for the operation in the loop
8236 This isn't necessary for the lane reduction codes, since they
8237 can only be produced by pattern matching, and it's up to the
8238 pattern matcher to test for support. The main reason for
8239 specifically skipping this step is to avoid rechecking whether
8240 mixed-sign dot-products can be implemented using signed
8241 dot-products. */
8242 machine_mode vec_mode = TYPE_MODE (vectype_in);
8243 if (!lane_reduc_code_p
8244 && !directly_supported_p (op.code, vectype_in, optab_vector))
8246 if (dump_enabled_p ())
8247 dump_printf (MSG_NOTE, "op not supported by target.\n");
8248 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8249 || !vect_can_vectorize_without_simd_p (op.code))
8250 ok = false;
8251 else
8252 if (dump_enabled_p ())
8253 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8256 if (vect_emulated_vector_p (vectype_in)
8257 && !vect_can_vectorize_without_simd_p (op.code))
8259 if (dump_enabled_p ())
8260 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8261 return false;
8264 /* lane-reducing operations have to go through vect_transform_reduction.
8265 For the other cases try without the single cycle optimization. */
8266 if (!ok)
8268 if (lane_reduc_code_p)
8269 return false;
8270 else
8271 single_defuse_cycle = false;
8274 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8276 /* If the reduction stmt is one of the patterns that have lane
8277 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8278 if ((ncopies > 1 && ! single_defuse_cycle)
8279 && lane_reduc_code_p)
8281 if (dump_enabled_p ())
8282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8283 "multi def-use cycle not possible for lane-reducing "
8284 "reduction operation\n");
8285 return false;
8288 if (slp_node
8289 && !(!single_defuse_cycle
8290 && !lane_reduc_code_p
8291 && reduction_type != FOLD_LEFT_REDUCTION))
8292 for (i = 0; i < (int) op.num_ops; i++)
8293 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8295 if (dump_enabled_p ())
8296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8297 "incompatible vector types for invariants\n");
8298 return false;
8301 if (slp_node)
8302 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8303 else
8304 vec_num = 1;
8306 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8307 reduction_type, ncopies, cost_vec);
8308 /* Cost the reduction op inside the loop if transformed via
8309 vect_transform_reduction. Otherwise this is costed by the
8310 separate vectorizable_* routines. */
8311 if (single_defuse_cycle || lane_reduc_code_p)
8313 int factor = 1;
8314 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8315 /* Three dot-products and a subtraction. */
8316 factor = 4;
8317 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8318 stmt_info, 0, vect_body);
8321 if (dump_enabled_p ()
8322 && reduction_type == FOLD_LEFT_REDUCTION)
8323 dump_printf_loc (MSG_NOTE, vect_location,
8324 "using an in-order (fold-left) reduction.\n");
8325 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8326 /* All reductions except single def-use cycle optimized, lane-reducing and
8327 fold-left ones go through their own vectorizable_* routines. */
8328 if (!single_defuse_cycle
8329 && !lane_reduc_code_p
8330 && reduction_type != FOLD_LEFT_REDUCTION)
8332 stmt_vec_info tem
8333 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8334 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8336 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8337 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8339 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8340 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8342 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8344 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8345 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8346 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8348 if (reduction_type != FOLD_LEFT_REDUCTION
8349 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8350 && (cond_fn == IFN_LAST
8351 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8352 OPTIMIZE_FOR_SPEED)))
8354 if (dump_enabled_p ())
8355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8356 "can't operate on partial vectors because"
8357 " no conditional operation is available.\n");
8358 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8360 else if (reduction_type == FOLD_LEFT_REDUCTION
8361 && reduc_fn == IFN_LAST
8362 && !expand_vec_cond_expr_p (vectype_in,
8363 truth_type_for (vectype_in),
8364 SSA_NAME))
8366 if (dump_enabled_p ())
8367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8368 "can't operate on partial vectors because"
8369 " no conditional operation is available.\n");
8370 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8372 else if (reduction_type == FOLD_LEFT_REDUCTION
8373 && internal_fn_mask_index (reduc_fn) == -1
8374 && FLOAT_TYPE_P (vectype_in)
8375 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8377 if (dump_enabled_p ())
8378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8379 "can't operate on partial vectors because"
8380 " signed zeros cannot be preserved.\n");
8381 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8383 else
8385 internal_fn mask_reduc_fn
8386 = get_masked_reduction_fn (reduc_fn, vectype_in);
8388 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8389 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8390 vectype_in, 1);
8391 else
8392 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8393 vectype_in, NULL);
8396 return true;
8399 /* STMT_INFO is a dot-product reduction whose multiplication operands
8400 have different signs. Emit a sequence to emulate the operation
8401 using a series of signed DOT_PROD_EXPRs and return the last
8402 statement generated. VEC_DEST is the result of the vector operation
8403 and VOP lists its inputs. */
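/* A rough source-level illustration (example chosen here, not taken from the
   sources): a mixed-sign dot product typically originates from a loop like

     int32_t acc = 0;
     for (int i = 0; i < n; i++)
       acc += (int32_t) u8[i] * (int32_t) s8[i];

   with u8 unsigned char and s8 signed char.  A target that only provides a
   signed-by-signed dot-product instruction can still vectorize this via the
   emulation sequence generated below.  */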
8405 static gassign *
8406 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8407 gimple_stmt_iterator *gsi, tree vec_dest,
8408 tree vop[3])
8410 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8411 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8412 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8413 gimple *new_stmt;
8415 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8416 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8417 std::swap (vop[0], vop[1]);
8419 /* Convert all inputs to signed types. */
8420 for (int i = 0; i < 3; ++i)
8421 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8423 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8424 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8425 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8426 vop[i] = tmp;
8429 /* In the comments below we assume 8-bit inputs for simplicity,
8430 but the approach works for any full integer type. */
8432 /* Create a vector of -128. */
8433 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8434 tree min_narrow = build_vector_from_val (narrow_vectype,
8435 min_narrow_elttype);
8437 /* Create a vector of 64. */
8438 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8439 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8440 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8442 /* Emit: SUB_RES = VOP[0] - 128. */
8443 tree sub_res = make_ssa_name (narrow_vectype);
8444 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8445 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8447 /* Emit:
8449 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8450 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8451 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8453 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8454 Doing the two 64 * y steps first allows more time to compute x. */
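/* As a quick check of the identity with arbitrary values, take x = 200
   (the unsigned VOP[0] input) and y = -3 (the signed VOP[1] input):
     x * y                 = -600
     (x - 128) * y         =  72 * -3 = -216
     64 * y + 64 * y       = -192 + -192 = -384
   and -216 + -384 = -600.  Note (x - 128) always fits in the signed
   8-bit range [-128, 127].  */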
8455 tree stage1 = make_ssa_name (wide_vectype);
8456 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8457 vop[1], half_narrow, vop[2]);
8458 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8460 tree stage2 = make_ssa_name (wide_vectype);
8461 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8462 vop[1], half_narrow, stage1);
8463 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8465 tree stage3 = make_ssa_name (wide_vectype);
8466 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8467 sub_res, vop[1], stage2);
8468 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8470 /* Convert STAGE3 to the reduction type. */
8471 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8474 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8475 value. */
8477 bool
8478 vect_transform_reduction (loop_vec_info loop_vinfo,
8479 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8480 gimple **vec_stmt, slp_tree slp_node)
8482 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8483 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8484 int i;
8485 int ncopies;
8486 int vec_num;
8488 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8489 gcc_assert (reduc_info->is_reduc_info);
8491 if (nested_in_vect_loop_p (loop, stmt_info))
8493 loop = loop->inner;
8494 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8497 gimple_match_op op;
8498 if (!gimple_extract_op (stmt_info->stmt, &op))
8499 gcc_unreachable ();
8501 /* All uses but the last are expected to be defined in the loop.
8502 The last use is the reduction variable. In case of nested cycle this
8503 assumption is not true: we use reduc_index to record the index of the
8504 reduction variable. */
8505 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8506 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8507 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8508 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8510 if (slp_node)
8512 ncopies = 1;
8513 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8515 else
8517 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8518 vec_num = 1;
8521 code_helper code = canonicalize_code (op.code, op.type);
8522 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8524 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8525 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8526 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8528 /* Transform. */
8529 tree new_temp = NULL_TREE;
8530 auto_vec<tree> vec_oprnds0;
8531 auto_vec<tree> vec_oprnds1;
8532 auto_vec<tree> vec_oprnds2;
8533 tree def0;
8535 if (dump_enabled_p ())
8536 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8538 /* FORNOW: Multiple types are not supported for condition. */
8539 if (code == COND_EXPR)
8540 gcc_assert (ncopies == 1);
8542 /* A binary COND_OP reduction must have the same definition and else
8543 value. */
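/* For example (illustrative, simplified GIMPLE): an if-converted conditional
   sum reduction may reach this point as

     _r2 = .COND_ADD (_mask, _r1, _x, _r1);

   where the accumulator _r1 is both a data operand and the else value, so
   lanes with a false mask simply pass the accumulator through unchanged.  */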
8544 bool cond_fn_p = code.is_internal_fn ()
8545 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8546 if (cond_fn_p)
8548 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8549 || code == IFN_COND_MUL || code == IFN_COND_AND
8550 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8551 gcc_assert (op.num_ops == 4
8552 && (op.ops[reduc_index]
8553 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8556 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8558 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8559 if (reduction_type == FOLD_LEFT_REDUCTION)
8561 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8562 gcc_assert (code.is_tree_code () || cond_fn_p);
8563 return vectorize_fold_left_reduction
8564 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8565 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8566 reduc_index, masks, lens);
8569 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8570 gcc_assert (single_defuse_cycle
8571 || code == DOT_PROD_EXPR
8572 || code == WIDEN_SUM_EXPR
8573 || code == SAD_EXPR);
8575 /* Create the destination vector */
8576 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8577 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8579 /* Get NCOPIES vector definitions for all operands except the reduction
8580 definition. */
8581 if (!cond_fn_p)
8583 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8584 single_defuse_cycle && reduc_index == 0
8585 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8586 single_defuse_cycle && reduc_index == 1
8587 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8588 op.num_ops == 3
8589 && !(single_defuse_cycle && reduc_index == 2)
8590 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8592 else
8594 /* For a conditional operation pass the truth type as mask
8595 vectype. */
8596 gcc_assert (single_defuse_cycle
8597 && (reduc_index == 1 || reduc_index == 2));
8598 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8599 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8600 reduc_index == 1 ? NULL_TREE : op.ops[1],
8601 NULL_TREE, &vec_oprnds1,
8602 reduc_index == 2 ? NULL_TREE : op.ops[2],
8603 NULL_TREE, &vec_oprnds2);
8606 /* For single def-use cycles get one copy of the vectorized reduction
8607 definition. */
8608 if (single_defuse_cycle)
8610 gcc_assert (!slp_node);
8611 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8612 op.ops[reduc_index],
8613 reduc_index == 0 ? &vec_oprnds0
8614 : (reduc_index == 1 ? &vec_oprnds1
8615 : &vec_oprnds2));
8618 bool emulated_mixed_dot_prod
8619 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8620 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8622 gimple *new_stmt;
8623 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8624 if (masked_loop_p && !mask_by_cond_expr)
8626 /* No conditional ifns have been defined for dot-product yet. */
8627 gcc_assert (code != DOT_PROD_EXPR);
8629 /* Make sure that the reduction accumulator is vop[0]. */
8630 if (reduc_index == 1)
8632 gcc_assert (commutative_binary_op_p (code, op.type));
8633 std::swap (vop[0], vop[1]);
8635 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8636 vec_num * ncopies, vectype_in, i);
8637 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8638 vop[0], vop[1], vop[0]);
8639 new_temp = make_ssa_name (vec_dest, call);
8640 gimple_call_set_lhs (call, new_temp);
8641 gimple_call_set_nothrow (call, true);
8642 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8643 new_stmt = call;
8645 else
8647 if (op.num_ops >= 3)
8648 vop[2] = vec_oprnds2[i];
8650 if (masked_loop_p && mask_by_cond_expr)
8652 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8653 vec_num * ncopies, vectype_in, i);
8654 build_vect_cond_expr (code, vop, mask, gsi);
8657 if (emulated_mixed_dot_prod)
8658 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8659 vec_dest, vop);
8661 else if (code.is_internal_fn () && !cond_fn_p)
8662 new_stmt = gimple_build_call_internal (internal_fn (code),
8663 op.num_ops,
8664 vop[0], vop[1], vop[2]);
8665 else if (code.is_internal_fn () && cond_fn_p)
8666 new_stmt = gimple_build_call_internal (internal_fn (code),
8667 op.num_ops,
8668 vop[0], vop[1], vop[2],
8669 vop[1]);
8670 else
8671 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8672 vop[0], vop[1], vop[2]);
8673 new_temp = make_ssa_name (vec_dest, new_stmt);
8674 gimple_set_lhs (new_stmt, new_temp);
8675 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8678 if (slp_node)
8679 slp_node->push_vec_def (new_stmt);
8680 else if (single_defuse_cycle
8681 && i < ncopies - 1)
8683 if (reduc_index == 0)
8684 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8685 else if (reduc_index == 1)
8686 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8687 else if (reduc_index == 2)
8688 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8690 else
8691 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8694 if (!slp_node)
8695 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8697 return true;
8700 /* Transform phase of a cycle PHI. */
8702 bool
8703 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8704 stmt_vec_info stmt_info, gimple **vec_stmt,
8705 slp_tree slp_node, slp_instance slp_node_instance)
8707 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8708 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8709 int i;
8710 int ncopies;
8711 int j;
8712 bool nested_cycle = false;
8713 int vec_num;
8715 if (nested_in_vect_loop_p (loop, stmt_info))
8717 loop = loop->inner;
8718 nested_cycle = true;
8721 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8722 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8723 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8724 gcc_assert (reduc_info->is_reduc_info);
8726 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8727 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8728 /* Leave the scalar phi in place. */
8729 return true;
8731 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8732 /* For a nested cycle we do not fill the above. */
8733 if (!vectype_in)
8734 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8735 gcc_assert (vectype_in);
8737 if (slp_node)
8739 /* The size vect_schedule_slp_instance computes is off for us. */
8740 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8741 * SLP_TREE_LANES (slp_node), vectype_in);
8742 ncopies = 1;
8744 else
8746 vec_num = 1;
8747 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8750 /* Check whether we should use a single PHI node and accumulate
8751 vectors to one before the backedge. */
8752 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8753 ncopies = 1;
8755 /* Create the destination vector */
8756 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8757 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8758 vectype_out);
8760 /* Get the loop-entry arguments. */
8761 tree vec_initial_def = NULL_TREE;
8762 auto_vec<tree> vec_initial_defs;
8763 if (slp_node)
8765 vec_initial_defs.reserve (vec_num);
8766 if (nested_cycle)
8768 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8769 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8770 &vec_initial_defs);
8772 else
8774 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8775 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8776 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8778 unsigned int num_phis = stmts.length ();
8779 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8780 num_phis = 1;
8781 initial_values.reserve (num_phis);
8782 for (unsigned int i = 0; i < num_phis; ++i)
8784 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8785 initial_values.quick_push (vect_phi_initial_value (this_phi));
8787 if (vec_num == 1)
8788 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8789 if (!initial_values.is_empty ())
8791 tree initial_value
8792 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8793 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8794 tree neutral_op
8795 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8796 code, initial_value);
8797 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8798 &vec_initial_defs, vec_num,
8799 stmts.length (), neutral_op);
8803 else
8805 /* Get at the scalar def before the loop that defines the initial
8806 value of the reduction variable. */
8807 tree initial_def = vect_phi_initial_value (phi);
8808 reduc_info->reduc_initial_values.safe_push (initial_def);
8809 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8810 and we can't use zero for induc_val, use initial_def. Similarly
8811 for REDUC_MIN and initial_def larger than the base. */
8812 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8814 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8815 if (TREE_CODE (initial_def) == INTEGER_CST
8816 && !integer_zerop (induc_val)
8817 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8818 && tree_int_cst_lt (initial_def, induc_val))
8819 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8820 && tree_int_cst_lt (induc_val, initial_def))))
8822 induc_val = initial_def;
8823 /* Communicate that we used the initial_def to epilogue
8824 generation. */
8825 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8827 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8829 else if (nested_cycle)
8831 /* Do not use an adjustment def as that case is not supported
8832 correctly if ncopies is not one. */
8833 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8834 ncopies, initial_def,
8835 &vec_initial_defs);
8837 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8838 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8839 /* Fill the initial vector with the initial scalar value. */
8840 vec_initial_def
8841 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8842 initial_def, initial_def);
8843 else
8845 if (ncopies == 1)
8846 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8847 if (!reduc_info->reduc_initial_values.is_empty ())
8849 initial_def = reduc_info->reduc_initial_values[0];
8850 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8851 tree neutral_op
8852 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8853 code, initial_def);
8854 gcc_assert (neutral_op);
8855 /* Try to simplify the vector initialization by applying an
8856 adjustment after the reduction has been performed. */
8857 if (!reduc_info->reused_accumulator
8858 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8859 && !operand_equal_p (neutral_op, initial_def))
8861 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8862 = initial_def;
8863 initial_def = neutral_op;
8865 vec_initial_def
8866 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8867 initial_def, neutral_op);
8872 if (vec_initial_def)
8874 vec_initial_defs.create (ncopies);
8875 for (i = 0; i < ncopies; ++i)
8876 vec_initial_defs.quick_push (vec_initial_def);
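/* A sketch of the case handled next (assumed example): if the main
   vectorized loop accumulated into, say, a V8HI vector and this is the
   epilogue loop vectorized with V4HI, the reused accumulator must first be
   reduced from eight lanes to four (and possibly converted to the epilogue's
   vector mode and signedness) before it can seed the epilogue's reduction
   PHI below.  */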
8879 if (auto *accumulator = reduc_info->reused_accumulator)
8881 tree def = accumulator->reduc_input;
8882 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8884 unsigned int nreduc;
8885 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8886 (TREE_TYPE (def)),
8887 TYPE_VECTOR_SUBPARTS (vectype_out),
8888 &nreduc);
8889 gcc_assert (res);
8890 gimple_seq stmts = NULL;
8891 /* Reduce the single vector to a smaller one. */
8892 if (nreduc != 1)
8894 /* Perform the reduction in the appropriate type. */
8895 tree rvectype = vectype_out;
8896 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8897 TREE_TYPE (TREE_TYPE (def))))
8898 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8899 TYPE_VECTOR_SUBPARTS
8900 (vectype_out));
8901 def = vect_create_partial_epilog (def, rvectype,
8902 STMT_VINFO_REDUC_CODE
8903 (reduc_info),
8904 &stmts);
8906 /* The epilogue loop might use a different vector mode, like
8907 VNx2DI vs. V2DI. */
8908 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8910 tree reduc_type = build_vector_type_for_mode
8911 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8912 def = gimple_convert (&stmts, reduc_type, def);
8914 /* Adjust the input so we pick up the partially reduced value
8915 for the skip edge in vect_create_epilog_for_reduction. */
8916 accumulator->reduc_input = def;
8917 /* And the reduction could be carried out using a different sign. */
8918 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8919 def = gimple_convert (&stmts, vectype_out, def);
8920 if (loop_vinfo->main_loop_edge)
8922 /* While we'd like to insert on the edge, this would split
8923 blocks and disturb bookkeeping; we will also eventually
8924 need this on the skip edge.  Rely on sinking to
8925 fix up optimal placement and insert in the predecessor. */
8926 gimple_stmt_iterator gsi
8927 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8928 /* Insert before a cond that eventually skips the
8929 epilogue. */
8930 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8931 gsi_prev (&gsi);
8932 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8934 else
8935 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8936 stmts);
8938 if (loop_vinfo->main_loop_edge)
8939 vec_initial_defs[0]
8940 = vect_get_main_loop_result (loop_vinfo, def,
8941 vec_initial_defs[0]);
8942 else
8943 vec_initial_defs.safe_push (def);
8946 /* Generate the reduction PHIs upfront. */
8947 for (i = 0; i < vec_num; i++)
8949 tree vec_init_def = vec_initial_defs[i];
8950 for (j = 0; j < ncopies; j++)
8952 /* Create the reduction-phi that defines the reduction
8953 operand. */
8954 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8956 /* Set the loop-entry arg of the reduction-phi. */
8957 if (j != 0 && nested_cycle)
8958 vec_init_def = vec_initial_defs[j];
8959 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8960 UNKNOWN_LOCATION);
8962 /* The loop-latch arg is set in epilogue processing. */
8964 if (slp_node)
8965 slp_node->push_vec_def (new_phi);
8966 else
8968 if (j == 0)
8969 *vec_stmt = new_phi;
8970 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8975 return true;
8978 /* Vectorizes LC PHIs. */
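/* An illustrative example (simplified GIMPLE): a loop-closed PHI is a
   single-argument PHI in the block just after the loop, e.g.

     exit_bb:
       sum_4 = PHI <sum_3(loop_bb)>

   Vectorizing it amounts to creating the corresponding single-argument
   vector PHI fed by the vectorized definition of sum_3.  */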
8980 bool
8981 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8982 stmt_vec_info stmt_info, gimple **vec_stmt,
8983 slp_tree slp_node)
8985 if (!loop_vinfo
8986 || !is_a <gphi *> (stmt_info->stmt)
8987 || gimple_phi_num_args (stmt_info->stmt) != 1)
8988 return false;
8990 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8991 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8992 return false;
8994 if (!vec_stmt) /* transformation not required. */
8996 /* Deal with copies from externs or constants that masquerade as
8997 loop-closed PHI nodes (PR97886). */
8998 if (slp_node
8999 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
9000 SLP_TREE_VECTYPE (slp_node)))
9002 if (dump_enabled_p ())
9003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9004 "incompatible vector types for invariants\n");
9005 return false;
9007 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9008 return true;
9011 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9012 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9013 basic_block bb = gimple_bb (stmt_info->stmt);
9014 edge e = single_pred_edge (bb);
9015 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9016 auto_vec<tree> vec_oprnds;
9017 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9018 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9019 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9020 for (unsigned i = 0; i < vec_oprnds.length (); i++)
9022 /* Create the vectorized LC PHI node. */
9023 gphi *new_phi = create_phi_node (vec_dest, bb);
9024 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9025 if (slp_node)
9026 slp_node->push_vec_def (new_phi);
9027 else
9028 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9030 if (!slp_node)
9031 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9033 return true;
9036 /* Vectorizes PHIs. */
9038 bool
9039 vectorizable_phi (vec_info *,
9040 stmt_vec_info stmt_info, gimple **vec_stmt,
9041 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9043 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9044 return false;
9046 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9047 return false;
9049 tree vectype = SLP_TREE_VECTYPE (slp_node);
9051 if (!vec_stmt) /* transformation not required. */
9053 slp_tree child;
9054 unsigned i;
9055 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9056 if (!child)
9058 if (dump_enabled_p ())
9059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9060 "PHI node with unvectorized backedge def\n");
9061 return false;
9063 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9067 "incompatible vector types for invariants\n");
9068 return false;
9070 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9071 && !useless_type_conversion_p (vectype,
9072 SLP_TREE_VECTYPE (child)))
9074 /* With bools we can have mask and non-mask precision vectors
9075 or different non-mask precisions.  While pattern recog is
9076 supposed to guarantee consistency here, bugs in it can cause
9077 mismatches (PR103489 and PR103800 for example).
9078 Deal with them here instead of ICEing later. */
9079 if (dump_enabled_p ())
9080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9081 "incompatible vector type setup from "
9082 "bool pattern detection\n");
9083 return false;
9086 /* For single-argument PHIs assume coalescing which means zero cost
9087 for the scalar and the vector PHIs. This avoids artificially
9088 favoring the vector path (but may pessimize it in some cases). */
9089 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9090 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9091 vector_stmt, stmt_info, vectype, 0, vect_body);
9092 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9093 return true;
9096 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9097 basic_block bb = gimple_bb (stmt_info->stmt);
9098 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9099 auto_vec<gphi *> new_phis;
9100 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9102 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9104 /* Skip not yet vectorized defs. */
9105 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9106 && SLP_TREE_VEC_DEFS (child).is_empty ())
9107 continue;
9109 auto_vec<tree> vec_oprnds;
9110 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9111 if (!new_phis.exists ())
9113 new_phis.create (vec_oprnds.length ());
9114 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9116 /* Create the vectorized PHI node. */
9117 new_phis.quick_push (create_phi_node (vec_dest, bb));
9118 slp_node->push_vec_def (new_phis[j]);
9121 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9122 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9123 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9125 /* We should have at least one already vectorized child. */
9126 gcc_assert (new_phis.exists ());
9128 return true;
9131 /* Vectorizes first order recurrences. An overview of the transformation
9132 is described below. Suppose we have the following loop.
9134 int t = 0;
9135 for (int i = 0; i < n; ++i)
9137 b[i] = a[i] - t;
9138 t = a[i];
9141 There is a first-order recurrence on 't' (it carries a[i] from the previous iteration).  For this loop, the scalar IR
9142 looks (simplified) like:
9144 scalar.preheader:
9145 init = 0;
9147 scalar.body:
9148 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9149 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9150 _1 = a[i]
9151 b[i] = _1 - _2
9152 if (i < n) goto scalar.body
9154 In this example, _2 is a recurrence because its value depends on the
9155 previous iteration. We vectorize this as (VF = 4)
9157 vector.preheader:
9158 vect_init = vect_cst(..., ..., ..., 0)
9160 vector.body
9161 i = PHI <0(vector.preheader), i+4(vector.body)>
9162 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9163 vect_2 = a[i, i+1, i+2, i+3];
9164 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9165 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9166 if (..) goto vector.body
9168 In this function, vectorizable_recurr, we code generate both the
9169 vector PHI node and the permute since those together compute the
9170 vectorized value of the scalar PHI. We do not yet have the
9171 backedge value to fill in there nor into the vec_perm. Those
9172 are filled in maybe_set_vectorized_backedge_value and
9173 vect_schedule_scc.
9175 TODO: Since the scalar loop does not have a use of the recurrence
9176 outside of the loop the natural way to implement peeling via
9177 vectorizing the live value doesn't work. For now peeling of loops
9178 with a recurrence is not implemented. For SLP the supported cases
9179 are restricted to those requiring a single vector recurrence PHI. */
9181 bool
9182 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9183 gimple **vec_stmt, slp_tree slp_node,
9184 stmt_vector_for_cost *cost_vec)
9186 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9187 return false;
9189 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9191 /* So far we only support first-order recurrence auto-vectorization. */
9192 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9193 return false;
9195 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9196 unsigned ncopies;
9197 if (slp_node)
9198 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9199 else
9200 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9201 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9202 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9203 /* We need to be able to make progress with a single vector. */
9204 if (maybe_gt (dist * 2, nunits))
9206 if (dump_enabled_p ())
9207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9208 "first order recurrence exceeds half of "
9209 "a vector\n");
9210 return false;
9213 /* First-order recurrence autovectorization needs to handle permutation
9214 with indices = [nunits-1, nunits, nunits+1, ...]. */
9215 vec_perm_builder sel (nunits, 1, 3);
9216 for (int i = 0; i < 3; ++i)
9217 sel.quick_push (nunits - dist + i);
9218 vec_perm_indices indices (sel, 2, nunits);
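/* For instance, with nunits == 4 and dist == 1 the selected lanes are
   { 3, 4, 5, 6 }: the last lane of the previous vector followed by the
   first three lanes of the current one; with dist == 2 (two SLP lanes)
   they are { 2, 3, 4, 5 }.  */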
9220 if (!vec_stmt) /* transformation not required. */
9222 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9223 indices))
9224 return false;
9226 if (slp_node)
9228 /* We eventually need to set a vector type on invariant
9229 arguments. */
9230 unsigned j;
9231 slp_tree child;
9232 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9233 if (!vect_maybe_update_slp_op_vectype
9234 (child, SLP_TREE_VECTYPE (slp_node)))
9236 if (dump_enabled_p ())
9237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9238 "incompatible vector types for "
9239 "invariants\n");
9240 return false;
9243 /* The recurrence costs the initialization vector and one permute
9244 for each copy. */
9245 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9246 stmt_info, 0, vect_prologue);
9247 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9248 stmt_info, 0, vect_body);
9249 if (dump_enabled_p ())
9250 dump_printf_loc (MSG_NOTE, vect_location,
9251 "vectorizable_recurr: inside_cost = %d, "
9252 "prologue_cost = %d .\n", inside_cost,
9253 prologue_cost);
9255 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9256 return true;
9259 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9260 basic_block bb = gimple_bb (phi);
9261 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9262 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9264 gimple_seq stmts = NULL;
9265 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9266 gsi_insert_seq_on_edge_immediate (pe, stmts);
9268 tree vec_init = build_vector_from_val (vectype, preheader);
9269 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9271 /* Create the vectorized first-order PHI node. */
9272 tree vec_dest = vect_get_new_vect_var (vectype,
9273 vect_simple_var, "vec_recur_");
9274 gphi *new_phi = create_phi_node (vec_dest, bb);
9275 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9277 /* Insert the shuffles for the first-order recurrence autovectorization:
9278 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9279 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9281 /* Insert the required permute after the latch definition. The
9282 second and later operands are tentative and will be updated when we have
9283 vectorized the latch definition. */
9284 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9285 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9286 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9287 gsi_next (&gsi2);
9289 for (unsigned i = 0; i < ncopies; ++i)
9291 vec_dest = make_ssa_name (vectype);
9292 gassign *vperm
9293 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9294 i == 0 ? gimple_phi_result (new_phi) : NULL,
9295 NULL, perm);
9296 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9298 if (slp_node)
9299 slp_node->push_vec_def (vperm);
9300 else
9301 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9304 if (!slp_node)
9305 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9306 return true;
9309 /* Return true if VECTYPE represents a vector that requires lowering
9310 by the vector lowering pass. */
9312 bool
9313 vect_emulated_vector_p (tree vectype)
9315 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9316 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9317 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9320 /* Return true if we can emulate CODE on an integer mode representation
9321 of a vector. */
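/* For instance (illustrative): on a target without vector modes a V4QI
   vector is represented as a 32-bit integer.  The bitwise codes work
   directly on that representation and plus/minus/negate can be lowered
   piecewise by the vector lowering pass, so they are accepted below;
   MULT_EXPR is not in the list, so e.g. a V4QI multiply still requires
   real vector support.  */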
9323 bool
9324 vect_can_vectorize_without_simd_p (tree_code code)
9326 switch (code)
9328 case PLUS_EXPR:
9329 case MINUS_EXPR:
9330 case NEGATE_EXPR:
9331 case BIT_AND_EXPR:
9332 case BIT_IOR_EXPR:
9333 case BIT_XOR_EXPR:
9334 case BIT_NOT_EXPR:
9335 return true;
9337 default:
9338 return false;
9342 /* Likewise, but taking a code_helper. */
9344 bool
9345 vect_can_vectorize_without_simd_p (code_helper code)
9347 return (code.is_tree_code ()
9348 && vect_can_vectorize_without_simd_p (tree_code (code)));
9351 /* Create vector init for vectorized iv. */
9352 static tree
9353 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9354 tree step_expr, poly_uint64 nunits,
9355 tree vectype,
9356 enum vect_induction_op_type induction_type)
9358 unsigned HOST_WIDE_INT const_nunits;
9359 tree vec_shift, vec_init, new_name;
9360 unsigned i;
9361 tree itype = TREE_TYPE (vectype);
9363 /* iv_loop is the loop to be vectorized.  Create the first VF values of
9364 the IV according to INDUCTION_TYPE (S = step_expr, X = init_expr). */
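/* For illustration (arbitrary values), with X = 16, S = 1 and four lanes:
     vect_step_op_shr: [16 >> 0, 16 >> 1, 16 >> 2, 16 >> 3] = [16, 8, 4, 2]
     vect_step_op_neg: [16, -16, 16, -16]
     vect_step_op_mul (with S = 3): [16 * 1, 16 * 3, 16 * 9, 16 * 27].  */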
9365 new_name = gimple_convert (stmts, itype, init_expr);
9366 switch (induction_type)
9368 case vect_step_op_shr:
9369 case vect_step_op_shl:
9370 /* Build the initial value by shifting INIT by the series [0, S, 2*S, ...]. */
9371 vec_init = gimple_build_vector_from_val (stmts,
9372 vectype,
9373 new_name);
9374 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9375 build_zero_cst (itype), step_expr);
9376 vec_init = gimple_build (stmts,
9377 (induction_type == vect_step_op_shr
9378 ? RSHIFT_EXPR : LSHIFT_EXPR),
9379 vectype, vec_init, vec_shift);
9380 break;
9382 case vect_step_op_neg:
9384 vec_init = gimple_build_vector_from_val (stmts,
9385 vectype,
9386 new_name);
9387 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9388 vectype, vec_init);
9389 /* The encoding has 2 interleaved stepped patterns. */
9390 vec_perm_builder sel (nunits, 2, 3);
9391 sel.quick_grow (6);
9392 for (i = 0; i < 3; i++)
9394 sel[2 * i] = i;
9395 sel[2 * i + 1] = i + nunits;
9397 vec_perm_indices indices (sel, 2, nunits);
9398 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9399 fail when vec_init is a const vector.  In that situation the vec_perm is not
9400 really needed. */
9401 tree perm_mask_even
9402 = vect_gen_perm_mask_any (vectype, indices);
9403 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9404 vectype,
9405 vec_init, vec_neg,
9406 perm_mask_even);
9408 break;
9410 case vect_step_op_mul:
9412 /* Use an unsigned mult to avoid undefined (signed) integer overflow. */
9413 gcc_assert (nunits.is_constant (&const_nunits));
9414 tree utype = unsigned_type_for (itype);
9415 tree uvectype = build_vector_type (utype,
9416 TYPE_VECTOR_SUBPARTS (vectype));
9417 new_name = gimple_convert (stmts, utype, new_name);
9418 vec_init = gimple_build_vector_from_val (stmts,
9419 uvectype,
9420 new_name);
9421 tree_vector_builder elts (uvectype, const_nunits, 1);
9422 tree elt_step = build_one_cst (utype);
9424 elts.quick_push (elt_step);
9425 for (i = 1; i < const_nunits; i++)
9427 /* Create: new_name_i = new_name_{i-1} * step_expr, i.e. pow (step_expr, i). */
9428 elt_step = gimple_build (stmts, MULT_EXPR,
9429 utype, elt_step, step_expr);
9430 elts.quick_push (elt_step);
9432 /* Create a vector from [new_name_0, new_name_1, ...,
9433 new_name_nunits-1]. */
9434 tree vec_mul = gimple_build_vector (stmts, &elts);
9435 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9436 vec_init, vec_mul);
9437 vec_init = gimple_convert (stmts, vectype, vec_init);
9439 break;
9441 default:
9442 gcc_unreachable ();
9445 return vec_init;
9448 /* Peel INIT_EXPR by SKIP_NITERS iterations for INDUCTION_TYPE. */
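/* A sketch with assumed values: for SKIP_NITERS == 2,
     vect_step_op_neg: the initial value is unchanged (negated twice);
     vect_step_op_shr with step S: it becomes INIT >> (2 * S), or the
       well-defined fallback (0, or INIT >> (prec - 1) for arithmetic
       shifts) once the total shift amount reaches the precision;
     vect_step_op_mul with step S: it becomes INIT * S^2, computed modulo
       2^precision.  */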
9449 tree
9450 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9451 tree skip_niters, tree step_expr,
9452 enum vect_induction_op_type induction_type)
9454 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9455 tree type = TREE_TYPE (init_expr);
9456 unsigned prec = TYPE_PRECISION (type);
9457 switch (induction_type)
9459 case vect_step_op_neg:
9460 if (TREE_INT_CST_LOW (skip_niters) % 2)
9461 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9462 /* else no change. */
9463 break;
9465 case vect_step_op_shr:
9466 case vect_step_op_shl:
9467 skip_niters = gimple_convert (stmts, type, skip_niters);
9468 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9469 /* When the shift amount >= precision, we need to avoid undefined behavior.
9470 In the original loop there is no UB; according to the semantics,
9471 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9472 if (!tree_fits_uhwi_p (step_expr)
9473 || tree_to_uhwi (step_expr) >= prec)
9475 if (induction_type == vect_step_op_shl
9476 || TYPE_UNSIGNED (type))
9477 init_expr = build_zero_cst (type);
9478 else
9479 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9480 init_expr,
9481 wide_int_to_tree (type, prec - 1));
9483 else
9484 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9485 ? RSHIFT_EXPR : LSHIFT_EXPR),
9486 type, init_expr, step_expr);
9487 break;
9489 case vect_step_op_mul:
9491 tree utype = unsigned_type_for (type);
9492 init_expr = gimple_convert (stmts, utype, init_expr);
9493 wide_int skipn = wi::to_wide (skip_niters);
9494 wide_int begin = wi::to_wide (step_expr);
9495 auto_mpz base, exp, mod, res;
9496 wi::to_mpz (begin, base, TYPE_SIGN (type));
9497 wi::to_mpz (skipn, exp, UNSIGNED);
9498 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9499 mpz_powm (res, base, exp, mod);
9500 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9501 tree mult_expr = wide_int_to_tree (utype, begin);
9502 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9503 init_expr, mult_expr);
9504 init_expr = gimple_convert (stmts, type, init_expr);
9506 break;
9508 default:
9509 gcc_unreachable ();
9512 return init_expr;
9515 /* Create vector step for vectorized iv. */
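/* For example (values assumed), with VF == 4: a shift iv with scalar step S
   gets the vector step 4 * S (each vector iteration advances by four scalar
   iterations), a multiplicative iv with step S gets S^4, and a negated iv
   needs no step at all (NULL is returned) since an even number of negations
   leaves every lane unchanged.  */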
9516 static tree
9517 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9518 poly_uint64 vf,
9519 enum vect_induction_op_type induction_type)
9521 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9522 tree new_name = NULL;
9523 /* Step should be pow (step, vf) for mult induction. */
9524 if (induction_type == vect_step_op_mul)
9526 gcc_assert (vf.is_constant ());
9527 wide_int begin = wi::to_wide (step_expr);
9529 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9530 begin = wi::mul (begin, wi::to_wide (step_expr));
9532 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9534 else if (induction_type == vect_step_op_neg)
9535 /* Do nothing. */
9537 else
9538 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9539 expr, step_expr);
9540 return new_name;
9543 static tree
9544 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9545 stmt_vec_info stmt_info,
9546 tree new_name, tree vectype,
9547 enum vect_induction_op_type induction_type)
9549 /* No step is needed for neg induction. */
9550 if (induction_type == vect_step_op_neg)
9551 return NULL;
9553 tree t = unshare_expr (new_name);
9554 gcc_assert (CONSTANT_CLASS_P (new_name)
9555 || TREE_CODE (new_name) == SSA_NAME);
9556 tree new_vec = build_vector_from_val (vectype, t);
9557 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9558 new_vec, vectype, NULL);
9559 return vec_step;
9562 /* Update the vectorized iv with VEC_STEP; INDUC_DEF is the current value. */
9563 static tree
9564 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9565 tree induc_def, tree vec_step,
9566 enum vect_induction_op_type induction_type)
9568 tree vec_def = induc_def;
9569 switch (induction_type)
9571 case vect_step_op_mul:
9573 /* Use an unsigned mult to avoid undefined (signed) integer overflow. */
9574 tree uvectype
9575 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9576 TYPE_VECTOR_SUBPARTS (vectype));
9577 vec_def = gimple_convert (stmts, uvectype, vec_def);
9578 vec_step = gimple_convert (stmts, uvectype, vec_step);
9579 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9580 vec_def, vec_step);
9581 vec_def = gimple_convert (stmts, vectype, vec_def);
9583 break;
9585 case vect_step_op_shr:
9586 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9587 vec_def, vec_step);
9588 break;
9590 case vect_step_op_shl:
9591 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9592 vec_def, vec_step);
9593 break;
9594 case vect_step_op_neg:
9595 vec_def = induc_def;
9596 /* Do nothing. */
9597 break;
9598 default:
9599 gcc_unreachable ();
9602 return vec_def;
9606 /* Function vectorizable_nonlinear_induction
9608 Check if STMT_INFO performs a nonlinear induction computation that can be
9609 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9610 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9611 basic block.
9612 Return true if STMT_INFO is vectorizable in this way. */
9614 static bool
9615 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9616 stmt_vec_info stmt_info,
9617 gimple **vec_stmt, slp_tree slp_node,
9618 stmt_vector_for_cost *cost_vec)
9620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9621 unsigned ncopies;
9622 bool nested_in_vect_loop = false;
9623 class loop *iv_loop;
9624 tree vec_def;
9625 edge pe = loop_preheader_edge (loop);
9626 basic_block new_bb;
9627 tree vec_init, vec_step;
9628 tree new_name;
9629 gimple *new_stmt;
9630 gphi *induction_phi;
9631 tree induc_def, vec_dest;
9632 tree init_expr, step_expr;
9633 tree niters_skip;
9634 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9635 unsigned i;
9636 gimple_stmt_iterator si;
9638 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9640 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9641 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9642 enum vect_induction_op_type induction_type
9643 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9645 gcc_assert (induction_type > vect_step_op_add);
9647 if (slp_node)
9648 ncopies = 1;
9649 else
9650 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9651 gcc_assert (ncopies >= 1);
9653 /* FORNOW. Only handle nonlinear induction in the same loop. */
9654 if (nested_in_vect_loop_p (loop, stmt_info))
9656 if (dump_enabled_p ())
9657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9658 "nonlinear induction in nested loop.\n");
9659 return false;
9662 iv_loop = loop;
9663 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9665 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9666 update for each iv and a permutation to generate the wanted vector iv. */
9667 if (slp_node)
9669 if (dump_enabled_p ())
9670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9671 "SLP induction not supported for nonlinear"
9672 " induction.\n");
9673 return false;
9676 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9678 if (dump_enabled_p ())
9679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9680 "floating point nonlinear induction vectorization"
9681 " not supported.\n");
9682 return false;
9685 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9686 init_expr = vect_phi_initial_value (phi);
9687 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9688 && TREE_CODE (step_expr) == INTEGER_CST);
9689 /* step_expr needs to have the same type as init_expr,
9690 e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9691 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9693 if (TREE_CODE (init_expr) == INTEGER_CST)
9694 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9695 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9697 /* INIT_EXPR could be a bit_field; bail out in that case. */
9698 if (dump_enabled_p ())
9699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9700 "nonlinear induction vectorization failed:"
9701 " component type of vectype is not a nop conversion"
9702 " from type of init_expr.\n");
9703 return false;
9706 switch (induction_type)
9708 case vect_step_op_neg:
9709 if (TREE_CODE (init_expr) != INTEGER_CST
9710 && TREE_CODE (init_expr) != REAL_CST)
9712 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9713 if (!directly_supported_p (NEGATE_EXPR, vectype))
9714 return false;
9716 /* The encoding has 2 interleaved stepped patterns. */
9717 vec_perm_builder sel (nunits, 2, 3);
9718 machine_mode mode = TYPE_MODE (vectype);
9719 sel.quick_grow (6);
9720 for (i = 0; i < 3; i++)
9722 sel[i * 2] = i;
9723 sel[i * 2 + 1] = i + nunits;
9725 vec_perm_indices indices (sel, 2, nunits);
9726 if (!can_vec_perm_const_p (mode, mode, indices))
9727 return false;
9729 break;
9731 case vect_step_op_mul:
9733 /* Check for backend support of MULT_EXPR. */
9734 if (!directly_supported_p (MULT_EXPR, vectype))
9735 return false;
9737 /* ?? How to construct the vector step for variable-length vectors:
9738 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9739 if (!vf.is_constant ())
9740 return false;
9742 break;
9744 case vect_step_op_shr:
9745 /* Check for backend support of RSHIFT_EXPR. */
9746 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9747 return false;
9749 /* Don't shift more than type precision to avoid UD. */
9750 if (!tree_fits_uhwi_p (step_expr)
9751 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9752 TYPE_PRECISION (TREE_TYPE (init_expr))))
9753 return false;
9754 break;
9756 case vect_step_op_shl:
9757 /* Check for backend support of LSHIFT_EXPR. */
9758 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9759 return false;
9761 /* Don't shift more than type precision to avoid UD. */
9762 if (!tree_fits_uhwi_p (step_expr)
9763 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9764 TYPE_PRECISION (TREE_TYPE (init_expr))))
9765 return false;
9767 break;
9769 default:
9770 gcc_unreachable ();
9773 if (!vec_stmt) /* transformation not required. */
9775 unsigned inside_cost = 0, prologue_cost = 0;
9776 /* Loop cost for vec_loop. */
9778 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9779 stmt_info, 0, vect_body);
9781 /* Neg induction doesn't have any inside_cost. */
9783 if (induction_type == vect_step_op_neg)
9784 inside_cost = 0;
9786 /* prologue cost for vec_init and vec_step. */
9787 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9788 stmt_info, 0, vect_prologue);
9790 if (dump_enabled_p ())
9791 dump_printf_loc (MSG_NOTE, vect_location,
9792 "vect_model_induction_cost: inside_cost = %d, "
9793 "prologue_cost = %d. \n", inside_cost,
9794 prologue_cost);
9796 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9797 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9798 return true;
9801 /* Transform. */
9803 /* Compute a vector variable, initialized with the first VF values of
9804 the induction variable.  E.g., for a multiplicative iv with IV_PHI='X'
9805 and step S, for a vector of 4 units, we want to compute:
9806 [X, X*S, X*S^2, X*S^3]. */
9808 if (dump_enabled_p ())
9809 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9811 pe = loop_preheader_edge (iv_loop);
9812 /* Find the first insertion point in the BB. */
9813 basic_block bb = gimple_bb (phi);
9814 si = gsi_after_labels (bb);
9816 gimple_seq stmts = NULL;
9818 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9819 /* If we are using the loop mask to "peel" for alignment then we need
9820 to adjust the start value here. */
9821 if (niters_skip != NULL_TREE)
9822 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9823 step_expr, induction_type);
9825 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9826 step_expr, nunits, vectype,
9827 induction_type);
9828 if (stmts)
9830 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9831 gcc_assert (!new_bb);
9834 stmts = NULL;
9835 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9836 vf, induction_type);
9837 if (stmts)
9839 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9840 gcc_assert (!new_bb);
9843 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9844 new_name, vectype,
9845 induction_type);
9846 /* Create the following def-use cycle:
9847 loop prolog:
9848 vec_init = ...
9849 vec_step = ...
9850 loop:
9851 vec_iv = PHI <vec_init, vec_loop>
9853 STMT
9855 vec_loop = vec_iv OP vec_step; (OP as given by INDUCTION_TYPE) */
9857 /* Create the induction-phi that defines the induction-operand. */
9858 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9859 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9860 induc_def = PHI_RESULT (induction_phi);
9862 /* Create the iv update inside the loop. */
9863 stmts = NULL;
9864 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9865 induc_def, vec_step,
9866 induction_type);
9868 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9869 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9871 /* Set the arguments of the phi node: */
9872 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9873 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9874 UNKNOWN_LOCATION);
9876 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9877 *vec_stmt = induction_phi;
9879 /* In case the vectorization factor (VF) is bigger than the number
9880 of elements that we can fit in a vectype (nunits), we have to generate
9881 more than one vector stmt, i.e. we need to "unroll" the
9882 vector stmt by a factor VF/nunits. For more details see documentation
9883 in vectorizable_operation. */
9885 if (ncopies > 1)
9887 stmts = NULL;
9888 /* FORNOW. This restriction should be relaxed. */
9889 gcc_assert (!nested_in_vect_loop);
9891 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9892 nunits, induction_type);
9894 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9895 new_name, vectype,
9896 induction_type);
9897 vec_def = induc_def;
9898 for (i = 1; i < ncopies; i++)
9900 /* vec_i = vec_prev OP vec_step. */
9901 stmts = NULL;
9902 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9903 vec_def, vec_step,
9904 induction_type);
9905 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9906 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9907 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9911 if (dump_enabled_p ())
9912 dump_printf_loc (MSG_NOTE, vect_location,
9913 "transform induction: created def-use cycle: %G%G",
9914 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9916 return true;
9919 /* Function vectorizable_induction
9921 Check if STMT_INFO performs an induction computation that can be vectorized.
9922 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9923 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9924 Return true if STMT_INFO is vectorizable in this way. */
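/* As a brief illustration (assumed numbers): for an iv i with start 0,
   step 1, VF == 4 and a single copy, the code below produces roughly

     vec_iv = PHI <{ 0, 1, 2, 3 }(preheader), vec_iv_next(latch)>
     ...
     vec_iv_next = vec_iv + { 4, 4, 4, 4 };

   so each vector lane tracks i for one of the four concurrent scalar
   iterations.  */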
9926 bool
9927 vectorizable_induction (loop_vec_info loop_vinfo,
9928 stmt_vec_info stmt_info,
9929 gimple **vec_stmt, slp_tree slp_node,
9930 stmt_vector_for_cost *cost_vec)
9932 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9933 unsigned ncopies;
9934 bool nested_in_vect_loop = false;
9935 class loop *iv_loop;
9936 tree vec_def;
9937 edge pe = loop_preheader_edge (loop);
9938 basic_block new_bb;
9939 tree new_vec, vec_init, vec_step, t;
9940 tree new_name;
9941 gimple *new_stmt;
9942 gphi *induction_phi;
9943 tree induc_def, vec_dest;
9944 tree init_expr, step_expr;
9945 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9946 unsigned i;
9947 tree expr;
9948 gimple_stmt_iterator si;
9949 enum vect_induction_op_type induction_type
9950 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9952 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9953 if (!phi)
9954 return false;
9956 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9957 return false;
9959 /* Make sure it was recognized as an induction computation. */
9960 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9961 return false;
9963 /* Handle nonlinear induction in a separate place. */
9964 if (induction_type != vect_step_op_add)
9965 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9966 vec_stmt, slp_node, cost_vec);
9968 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9969 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9971 if (slp_node)
9972 ncopies = 1;
9973 else
9974 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9975 gcc_assert (ncopies >= 1);
9977 /* FORNOW. These restrictions should be relaxed. */
9978 if (nested_in_vect_loop_p (loop, stmt_info))
9980 imm_use_iterator imm_iter;
9981 use_operand_p use_p;
9982 gimple *exit_phi;
9983 edge latch_e;
9984 tree loop_arg;
9986 if (ncopies > 1)
9988 if (dump_enabled_p ())
9989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9990 "multiple types in nested loop.\n");
9991 return false;
9994 exit_phi = NULL;
9995 latch_e = loop_latch_edge (loop->inner);
9996 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9997 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9999 gimple *use_stmt = USE_STMT (use_p);
10000 if (is_gimple_debug (use_stmt))
10001 continue;
10003 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10005 exit_phi = use_stmt;
10006 break;
10009 if (exit_phi)
10011 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10012 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10013 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10015 if (dump_enabled_p ())
10016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10017 "inner-loop induction only used outside "
10018 "of the outer vectorized loop.\n");
10019 return false;
10023 nested_in_vect_loop = true;
10024 iv_loop = loop->inner;
10026 else
10027 iv_loop = loop;
10028 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10030 if (slp_node && !nunits.is_constant ())
10032 /* The current SLP code creates the step value element-by-element. */
10033 if (dump_enabled_p ())
10034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10035 "SLP induction not supported for variable-length"
10036 " vectors.\n");
10037 return false;
10040 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10042 if (dump_enabled_p ())
10043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10044 "floating point induction vectorization disabled\n");
10045 return false;
10048 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10049 gcc_assert (step_expr != NULL_TREE);
10050 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10051 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10053 if (dump_enabled_p ())
10054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10055 "bit-precision induction vectorization not "
10056 "supported.\n");
10057 return false;
10059 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10061 /* Check for backend support of PLUS/MINUS_EXPR. */
10062 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10063 || !directly_supported_p (MINUS_EXPR, step_vectype))
10064 return false;
10066 if (!vec_stmt) /* transformation not required. */
10068 unsigned inside_cost = 0, prologue_cost = 0;
10069 if (slp_node)
10071 /* We eventually need to set a vector type on invariant
10072 arguments. */
10073 unsigned j;
10074 slp_tree child;
10075 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10076 if (!vect_maybe_update_slp_op_vectype
10077 (child, SLP_TREE_VECTYPE (slp_node)))
10079 if (dump_enabled_p ())
10080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10081 "incompatible vector types for "
10082 "invariants\n");
10083 return false;
10085 /* loop cost for vec_loop. */
10086 inside_cost
10087 = record_stmt_cost (cost_vec,
10088 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10089 vector_stmt, stmt_info, 0, vect_body);
10090 /* prologue cost for vec_init (if not nested) and step. */
10091 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10092 scalar_to_vec,
10093 stmt_info, 0, vect_prologue);
10095 else /* if (!slp_node) */
10097 /* loop cost for vec_loop. */
10098 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10099 stmt_info, 0, vect_body);
10100 /* prologue cost for vec_init and vec_step. */
10101 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10102 stmt_info, 0, vect_prologue);
10104 if (dump_enabled_p ())
10105 dump_printf_loc (MSG_NOTE, vect_location,
10106 "vect_model_induction_cost: inside_cost = %d, "
10107 "prologue_cost = %d .\n", inside_cost,
10108 prologue_cost);
10110 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10111 DUMP_VECT_SCOPE ("vectorizable_induction");
10112 return true;
10115 /* Transform. */
10117 /* Compute a vector variable, initialized with the first VF values of
10118 the induction variable. E.g., for an iv with IV_PHI='X' and
10119 evolution S, for a vector of 4 units, we want to compute:
10120 [X, X + S, X + 2*S, X + 3*S]. */
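/* As an illustration (hypothetical numbers, not from a dump): for an iv
   with init X = 0, step S = 3 and a 4-element vector we build in the
   preheader
     vec_init = { 0, 3, 6, 9 }
     vec_step = { 12, 12, 12, 12 }   (VF * S per update)
   and every vector iteration adds vec_step to the PHI result.  */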
10122 if (dump_enabled_p ())
10123 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10125 pe = loop_preheader_edge (iv_loop);
10126 /* Find the first insertion point in the BB. */
10127 basic_block bb = gimple_bb (phi);
10128 si = gsi_after_labels (bb);
10130 /* For SLP induction we have to generate several IVs as for example
10131 with group size 3 we need
10132 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10133 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
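/* As a smaller hypothetical instance: with group size 2, 4-element
   vectors and a single vector stmt per group we start from
     [i0, i1, i0 + S0, i1 + S1]
   and each vector iteration adds 2*S0 resp. 2*S1 to the lanes, since
   one vector covers the group (nvects * nunits) / group_size = 2 times.  */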
10134 if (slp_node)
10136 /* Enforced above. */
10137 unsigned int const_nunits = nunits.to_constant ();
10139 /* The initial values are vectorized, but any lanes > group_size
10140 need adjustment. */
10141 slp_tree init_node
10142 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10144 /* Gather steps. Since we do not vectorize inductions as
10145 cycles we have to reconstruct the step from SCEV data. */
10146 unsigned group_size = SLP_TREE_LANES (slp_node);
10147 tree *steps = XALLOCAVEC (tree, group_size);
10148 tree *inits = XALLOCAVEC (tree, group_size);
10149 stmt_vec_info phi_info;
10150 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10152 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10153 if (!init_node)
10154 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10155 pe->dest_idx);
10158 /* Now generate the IVs. */
10159 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10160 gcc_assert ((const_nunits * nvects) % group_size == 0);
10161 unsigned nivs;
10162 if (nested_in_vect_loop)
10163 nivs = nvects;
10164 else
10166 /* Compute the number of distinct IVs we need. First reduce
10167 group_size if it is a multiple of const_nunits so we get
10168 one IV for a group_size of 4 but const_nunits 2. */
10169 unsigned group_sizep = group_size;
10170 if (group_sizep % const_nunits == 0)
10171 group_sizep = group_sizep / const_nunits;
10172 nivs = least_common_multiple (group_sizep,
10173 const_nunits) / const_nunits;
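/* Worked numbers for the formula above (hypothetical): group_size = 4 and
   const_nunits = 2 give group_sizep = 2 and nivs = lcm (2, 2) / 2 = 1;
   group_size = 6 and const_nunits = 4 leave group_sizep = 6 and give
   nivs = lcm (6, 4) / 4 = 3.  */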
10175 tree stept = TREE_TYPE (step_vectype);
10176 tree lupdate_mul = NULL_TREE;
10177 if (!nested_in_vect_loop)
10179 /* The number of iterations covered in one vector iteration. */
10180 unsigned lup_mul = (nvects * const_nunits) / group_size;
10181 lupdate_mul
10182 = build_vector_from_val (step_vectype,
10183 SCALAR_FLOAT_TYPE_P (stept)
10184 ? build_real_from_wide (stept, lup_mul,
10185 UNSIGNED)
10186 : build_int_cstu (stept, lup_mul));
10188 tree peel_mul = NULL_TREE;
10189 gimple_seq init_stmts = NULL;
10190 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10192 if (SCALAR_FLOAT_TYPE_P (stept))
10193 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10194 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10195 else
10196 peel_mul = gimple_convert (&init_stmts, stept,
10197 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10198 peel_mul = gimple_build_vector_from_val (&init_stmts,
10199 step_vectype, peel_mul);
10201 unsigned ivn;
10202 auto_vec<tree> vec_steps;
10203 for (ivn = 0; ivn < nivs; ++ivn)
10205 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10206 tree_vector_builder init_elts (vectype, const_nunits, 1);
10207 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10208 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10210 /* The scalar steps of the IVs. */
10211 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10212 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10213 step_elts.quick_push (elt);
10214 if (!init_node)
10216 /* The scalar inits of the IVs if not vectorized. */
10217 elt = inits[(ivn*const_nunits + eltn) % group_size];
10218 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10219 TREE_TYPE (elt)))
10220 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10221 TREE_TYPE (vectype), elt);
10222 init_elts.quick_push (elt);
10224 /* The number of steps to add to the initial values. */
10225 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10226 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10227 ? build_real_from_wide (stept,
10228 mul_elt, UNSIGNED)
10229 : build_int_cstu (stept, mul_elt));
10231 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10232 vec_steps.safe_push (vec_step);
10233 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10234 if (peel_mul)
10235 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10236 step_mul, peel_mul);
10237 if (!init_node)
10238 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10240 /* Create the induction-phi that defines the induction-operand. */
10241 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10242 "vec_iv_");
10243 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10244 induc_def = PHI_RESULT (induction_phi);
10246 /* Create the iv update inside the loop */
10247 tree up = vec_step;
10248 if (lupdate_mul)
10249 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10250 vec_step, lupdate_mul);
10251 gimple_seq stmts = NULL;
10252 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10253 vec_def = gimple_build (&stmts,
10254 PLUS_EXPR, step_vectype, vec_def, up);
10255 vec_def = gimple_convert (&stmts, vectype, vec_def);
10256 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10257 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10258 UNKNOWN_LOCATION);
10260 if (init_node)
10261 vec_init = vect_get_slp_vect_def (init_node, ivn);
10262 if (!nested_in_vect_loop
10263 && !integer_zerop (step_mul))
10265 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10266 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10267 vec_step, step_mul);
10268 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10269 vec_def, up);
10270 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10273 /* Set the arguments of the phi node: */
10274 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10276 slp_node->push_vec_def (induction_phi);
10278 if (!nested_in_vect_loop)
10280 /* Fill up to the number of vectors we need for the whole group. */
10281 nivs = least_common_multiple (group_size,
10282 const_nunits) / const_nunits;
10283 vec_steps.reserve (nivs-ivn);
10284 for (; ivn < nivs; ++ivn)
10286 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10287 vec_steps.quick_push (vec_steps[0]);
10291 /* Re-use IVs when we can. We are generating further vector
10292 stmts by adding VF' * stride to the IVs generated above. */
10293 if (ivn < nvects)
10295 unsigned vfp
10296 = least_common_multiple (group_size, const_nunits) / group_size;
10297 tree lupdate_mul
10298 = build_vector_from_val (step_vectype,
10299 SCALAR_FLOAT_TYPE_P (stept)
10300 ? build_real_from_wide (stept,
10301 vfp, UNSIGNED)
10302 : build_int_cstu (stept, vfp));
10303 for (; ivn < nvects; ++ivn)
10305 gimple *iv
10306 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10307 tree def = gimple_get_lhs (iv);
10308 if (ivn < 2*nivs)
10309 vec_steps[ivn - nivs]
10310 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10311 vec_steps[ivn - nivs], lupdate_mul);
10312 gimple_seq stmts = NULL;
10313 def = gimple_convert (&stmts, step_vectype, def);
10314 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10315 def, vec_steps[ivn % nivs]);
10316 def = gimple_convert (&stmts, vectype, def);
10317 if (gimple_code (iv) == GIMPLE_PHI)
10318 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10319 else
10321 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10322 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10324 slp_node->push_vec_def (def);
10328 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10329 gcc_assert (!new_bb);
10331 return true;
10334 init_expr = vect_phi_initial_value (phi);
10336 gimple_seq stmts = NULL;
10337 if (!nested_in_vect_loop)
10339 /* Convert the initial value to the IV update type. */
10340 tree new_type = TREE_TYPE (step_expr);
10341 init_expr = gimple_convert (&stmts, new_type, init_expr);
10343 /* If we are using the loop mask to "peel" for alignment then we need
10344 to adjust the start value here. */
10345 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10346 if (skip_niters != NULL_TREE)
10348 if (FLOAT_TYPE_P (vectype))
10349 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10350 skip_niters);
10351 else
10352 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10353 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10354 skip_niters, step_expr);
10355 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10356 init_expr, skip_step);
10360 if (stmts)
10362 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10363 gcc_assert (!new_bb);
10366 /* Create the vector that holds the initial_value of the induction. */
10367 if (nested_in_vect_loop)
10369 /* iv_loop is nested in the loop to be vectorized. init_expr has already
10370 been created during vectorization of previous stmts. We obtain it
10371 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10372 auto_vec<tree> vec_inits;
10373 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10374 init_expr, &vec_inits);
10375 vec_init = vec_inits[0];
10376 /* If the initial value is not of proper type, convert it. */
10377 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10379 new_stmt
10380 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10381 vect_simple_var,
10382 "vec_iv_"),
10383 VIEW_CONVERT_EXPR,
10384 build1 (VIEW_CONVERT_EXPR, vectype,
10385 vec_init));
10386 vec_init = gimple_assign_lhs (new_stmt);
10387 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10388 new_stmt);
10389 gcc_assert (!new_bb);
10392 else
10394 /* iv_loop is the loop to be vectorized. Create:
10395 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10396 stmts = NULL;
10397 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10399 unsigned HOST_WIDE_INT const_nunits;
10400 if (nunits.is_constant (&const_nunits))
10402 tree_vector_builder elts (step_vectype, const_nunits, 1);
10403 elts.quick_push (new_name);
10404 for (i = 1; i < const_nunits; i++)
10406 /* Create: new_name_i = new_name + step_expr */
10407 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10408 new_name, step_expr);
10409 elts.quick_push (new_name);
10411 /* Create a vector from [new_name_0, new_name_1, ...,
10412 new_name_nunits-1] */
10413 vec_init = gimple_build_vector (&stmts, &elts);
10415 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10416 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10417 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10418 new_name, step_expr);
10419 else
10421 /* Build:
10422 [base, base, base, ...]
10423 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10424 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10425 gcc_assert (flag_associative_math);
10426 tree index = build_index_vector (step_vectype, 0, 1);
10427 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10428 new_name);
10429 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10430 step_expr);
10431 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10432 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10433 vec_init, step_vec);
10434 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10435 vec_init, base_vec);
10437 vec_init = gimple_convert (&stmts, vectype, vec_init);
10439 if (stmts)
10441 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10442 gcc_assert (!new_bb);
10447 /* Create the vector that holds the step of the induction. */
10448 gimple_stmt_iterator *step_iv_si = NULL;
10449 if (nested_in_vect_loop)
10450 /* iv_loop is nested in the loop to be vectorized. Generate:
10451 vec_step = [S, S, S, S] */
10452 new_name = step_expr;
10453 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10455 /* When we're using the loop_len produced by SELECT_VL, the non-final
10456 iterations do not always process VF elements. So instead of vectorizing
10457 the induction variable update as
10459 _21 = vect_vec_iv_.6_22 + { VF, ... };
10461 we should generate:
10463 _35 = .SELECT_VL (ivtmp_33, VF);
10464 vect_cst__22 = [vec_duplicate_expr] _35;
10465 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
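/* E.g. (hypothetical counts), vectorizing 10 iterations with VF = 4:
   .SELECT_VL might return 4, 4 and finally 2, so the vectorized IV must
   advance by len * S each iteration rather than by the constant VF * S.  */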
10466 gcc_assert (!slp_node);
10467 gimple_seq seq = NULL;
10468 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10469 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10470 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10471 unshare_expr (len)),
10472 &seq, true, NULL_TREE);
10473 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10474 step_expr);
10475 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10476 step_iv_si = &si;
10478 else
10480 /* iv_loop is the loop to be vectorized. Generate:
10481 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10482 gimple_seq seq = NULL;
10483 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10485 expr = build_int_cst (integer_type_node, vf);
10486 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10488 else
10489 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10490 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10491 expr, step_expr);
10492 if (seq)
10494 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10495 gcc_assert (!new_bb);
10499 t = unshare_expr (new_name);
10500 gcc_assert (CONSTANT_CLASS_P (new_name)
10501 || TREE_CODE (new_name) == SSA_NAME);
10502 new_vec = build_vector_from_val (step_vectype, t);
10503 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10504 new_vec, step_vectype, step_iv_si);
10507 /* Create the following def-use cycle:
10508 loop prolog:
10509 vec_init = ...
10510 vec_step = ...
10511 loop:
10512 vec_iv = PHI <vec_init, vec_loop>
10514 STMT
10516 vec_loop = vec_iv + vec_step; */
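/* A concrete (made-up) instance of the cycle above, for an integer IV
   starting at 0 with step 1 and a 4-lane vector:
   loop prolog:
     vec_init = { 0, 1, 2, 3 }
     vec_step = { 4, 4, 4, 4 }
   loop:
     vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
     ...
     vec_loop = vec_iv + vec_step;  */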
10518 /* Create the induction-phi that defines the induction-operand. */
10519 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10520 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10521 induc_def = PHI_RESULT (induction_phi);
10523 /* Create the iv update inside the loop */
10524 stmts = NULL;
10525 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10526 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10527 vec_def = gimple_convert (&stmts, vectype, vec_def);
10528 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10529 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10531 /* Set the arguments of the phi node: */
10532 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10533 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10534 UNKNOWN_LOCATION);
10536 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10537 *vec_stmt = induction_phi;
10539 /* In case the vectorization factor (VF) is bigger than the number
10540 of elements that we can fit in a vectype (nunits), we have to generate
10541 more than one vector stmt - i.e., we need to "unroll" the
10542 vector stmt by a factor VF/nunits. For more details see documentation
10543 in vectorizable_operation. */
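/* E.g. (hypothetical), VF = 8 with a 4-lane vectype gives ncopies = 2:
   two vector statements per iteration of the vectorized loop.  */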
10545 if (ncopies > 1)
10547 gimple_seq seq = NULL;
10548 /* FORNOW. This restriction should be relaxed. */
10549 gcc_assert (!nested_in_vect_loop);
10550 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10551 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10553 /* Create the vector that holds the step of the induction. */
10554 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10556 expr = build_int_cst (integer_type_node, nunits);
10557 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10559 else
10560 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10561 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10562 expr, step_expr);
10563 if (seq)
10565 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10566 gcc_assert (!new_bb);
10569 t = unshare_expr (new_name);
10570 gcc_assert (CONSTANT_CLASS_P (new_name)
10571 || TREE_CODE (new_name) == SSA_NAME);
10572 new_vec = build_vector_from_val (step_vectype, t);
10573 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10574 new_vec, step_vectype, NULL);
10576 vec_def = induc_def;
10577 for (i = 1; i < ncopies + 1; i++)
10579 /* vec_i = vec_prev + vec_step */
10580 gimple_seq stmts = NULL;
10581 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10582 vec_def = gimple_build (&stmts,
10583 PLUS_EXPR, step_vectype, vec_def, vec_step);
10584 vec_def = gimple_convert (&stmts, vectype, vec_def);
10586 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10587 if (i < ncopies)
10589 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10590 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10592 else
10594 /* vec_1 = vec_iv + (VF/n * S)
10595 vec_2 = vec_1 + (VF/n * S)
10597 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10599 vec_n is used as vec_loop to save the large step register and
10600 related operations. */
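/* With made-up numbers VF = 8, nunits = 4 (so ncopies = 2) and step S:
   vec_step = { 4*S, ... }, vec_1 = vec_iv + { 4*S, ... } is pushed as the
   second vector copy, and vec_2 = vec_1 + { 4*S, ... } = vec_iv + { 8*S, ... }
   becomes the latch value, so no separate { 8*S, ... } register is needed.  */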
10601 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10602 UNKNOWN_LOCATION);
10607 if (dump_enabled_p ())
10608 dump_printf_loc (MSG_NOTE, vect_location,
10609 "transform induction: created def-use cycle: %G%G",
10610 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10612 return true;
10615 /* Function vectorizable_live_operation_1.
10617 Helper function for vectorizable_live_operation. */
10619 static tree
10620 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10621 stmt_vec_info stmt_info, basic_block exit_bb,
10622 tree vectype, int ncopies, slp_tree slp_node,
10623 tree bitsize, tree bitstart, tree vec_lhs,
10624 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10626 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10628 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10629 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10630 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10631 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10633 gimple_seq stmts = NULL;
10634 tree new_tree;
10636 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10637 if (integer_zerop (bitstart))
10639 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10640 vec_lhs_phi, bitsize, bitstart);
10642 /* Convert the extracted vector element to the scalar type. */
10643 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10645 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10647 /* Emit:
10649 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10651 where VEC_LHS is the vectorized live-out result and LEN is
10652 the loop length for the final iteration. */
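/* The number of active lanes is LEN + BIAS, so the last active lane is
   LEN + BIAS - 1; e.g. (hypothetically) LEN = 3 with a zero bias
   extracts lane 2.  */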
10653 gcc_assert (ncopies == 1 && !slp_node);
10654 gimple_seq tem = NULL;
10655 gimple_stmt_iterator gsi = gsi_last (tem);
10656 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10657 &LOOP_VINFO_LENS (loop_vinfo),
10658 1, vectype, 0, 0);
10660 /* BIAS - 1. */
10661 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10662 tree bias_minus_one
10663 = int_const_binop (MINUS_EXPR,
10664 build_int_cst (TREE_TYPE (len), biasval),
10665 build_one_cst (TREE_TYPE (len)));
10667 /* LAST_INDEX = LEN + (BIAS - 1). */
10668 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10669 len, bias_minus_one);
10671 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10672 tree scalar_res
10673 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10674 vec_lhs_phi, last_index);
10676 /* Convert the extracted vector element to the scalar type. */
10677 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10679 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10681 /* Emit:
10683 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10685 where VEC_LHS is the vectorized live-out result and MASK is
10686 the loop mask for the final iteration. */
10687 gcc_assert (!slp_node);
10688 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10689 gimple_seq tem = NULL;
10690 gimple_stmt_iterator gsi = gsi_last (tem);
10691 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10692 &LOOP_VINFO_MASKS (loop_vinfo),
10693 1, vectype, 0);
10694 tree scalar_res;
10695 gimple_seq_add_seq (&stmts, tem);
10697 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10698 mask, vec_lhs_phi);
10700 /* Convert the extracted vector element to the scalar type. */
10701 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10703 else
10705 tree bftype = TREE_TYPE (vectype);
10706 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10707 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10708 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10709 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10710 &stmts, true, NULL_TREE);
10713 *exit_gsi = gsi_after_labels (exit_bb);
10714 if (stmts)
10715 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10717 return new_tree;
10720 /* Find the edge that's the final one in the path from SRC to DEST and
10721 return it. There must be at most one forwarder block between SRC and DEST. */
10723 static edge
10724 find_connected_edge (edge src, basic_block dest)
10726 if (src->dest == dest)
10727 return src;
10729 return find_edge (src->dest, dest);
10732 /* Function vectorizable_live_operation.
10734 STMT_INFO computes a value that is used outside the loop. Check if
10735 it can be supported. */
10737 bool
10738 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10739 slp_tree slp_node, slp_instance slp_node_instance,
10740 int slp_index, bool vec_stmt_p,
10741 stmt_vector_for_cost *cost_vec)
10743 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10744 imm_use_iterator imm_iter;
10745 tree lhs, lhs_type, bitsize;
10746 tree vectype = (slp_node
10747 ? SLP_TREE_VECTYPE (slp_node)
10748 : STMT_VINFO_VECTYPE (stmt_info));
10749 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10750 int ncopies;
10751 gimple *use_stmt;
10752 use_operand_p use_p;
10753 auto_vec<tree> vec_oprnds;
10754 int vec_entry = 0;
10755 poly_uint64 vec_index = 0;
10757 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10758 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10760 /* If a stmt of a reduction is live, vectorize it via
10761 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10762 validity so just trigger the transform here. */
10763 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10765 if (!vec_stmt_p)
10766 return true;
10767 if (slp_node)
10769 /* For reduction chains the meta-info is attached to
10770 the group leader. */
10771 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10772 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10773 /* For SLP reductions we vectorize the epilogue for
10774 all involved stmts together. */
10775 else if (slp_index != 0)
10776 return true;
10778 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10779 gcc_assert (reduc_info->is_reduc_info);
10780 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10781 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10782 return true;
10784 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10785 slp_node_instance,
10786 LOOP_VINFO_IV_EXIT (loop_vinfo));
10788 /* For an early-break loop we only have to materialize the reduction on the
10789 merge block, but we have to find the alternate exit first. */
10790 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10792 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10793 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10795 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10796 slp_node, slp_node_instance,
10797 exit);
10798 break;
10802 return true;
10805 /* If STMT is not relevant and it is a simple assignment and its inputs are
10806 invariant then it can remain in place, unvectorized. The original last
10807 scalar value that it computes will be used. */
10808 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10810 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10811 if (dump_enabled_p ())
10812 dump_printf_loc (MSG_NOTE, vect_location,
10813 "statement is simple and uses invariant. Leaving in "
10814 "place.\n");
10815 return true;
10818 if (slp_node)
10819 ncopies = 1;
10820 else
10821 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10823 if (slp_node)
10825 gcc_assert (slp_index >= 0);
10827 /* Get the last occurrence of the scalar index from the concatenation of
10828 all the slp vectors. Calculate which slp vector it is and the index
10829 within. */
10830 int num_scalar = SLP_TREE_LANES (slp_node);
10831 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10832 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10834 /* Calculate which vector contains the result, and which lane of
10835 that vector we need. */
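/* For example (made-up sizes): num_vec = 2, nunits = 4, num_scalar = 3
   and slp_index = 1 give pos = 8 - 3 + 1 = 6, i.e. vec_entry = 1 and
   vec_index = 2 below.  */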
10836 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10838 if (dump_enabled_p ())
10839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10840 "Cannot determine which vector holds the"
10841 " final result.\n");
10842 return false;
10846 if (!vec_stmt_p)
10848 /* No transformation required. */
10849 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10851 if (slp_node)
10853 if (dump_enabled_p ())
10854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10855 "can't operate on partial vectors "
10856 "because an SLP statement is live after "
10857 "the loop.\n");
10858 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10860 else if (ncopies > 1)
10862 if (dump_enabled_p ())
10863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10864 "can't operate on partial vectors "
10865 "because ncopies is greater than 1.\n");
10866 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10868 else
10870 gcc_assert (ncopies == 1 && !slp_node);
10871 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10872 OPTIMIZE_FOR_SPEED))
10873 vect_record_loop_mask (loop_vinfo,
10874 &LOOP_VINFO_MASKS (loop_vinfo),
10875 1, vectype, NULL);
10876 else if (can_vec_extract_var_idx_p (
10877 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10878 vect_record_loop_len (loop_vinfo,
10879 &LOOP_VINFO_LENS (loop_vinfo),
10880 1, vectype, 1);
10881 else
10883 if (dump_enabled_p ())
10884 dump_printf_loc (
10885 MSG_MISSED_OPTIMIZATION, vect_location,
10886 "can't operate on partial vectors "
10887 "because the target doesn't support extract "
10888 "last reduction.\n");
10889 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10893 /* ??? Enable for loop costing as well. */
10894 if (!loop_vinfo)
10895 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10896 0, vect_epilogue);
10897 return true;
10900 /* Use the lhs of the original scalar statement. */
10901 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10902 if (dump_enabled_p ())
10903 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10904 "stmt %G", stmt);
10906 lhs = gimple_get_lhs (stmt);
10907 lhs_type = TREE_TYPE (lhs);
10909 bitsize = vector_element_bits_tree (vectype);
10911 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10912 tree vec_lhs, vec_lhs0, bitstart;
10913 gimple *vec_stmt, *vec_stmt0;
10914 if (slp_node)
10916 gcc_assert (!loop_vinfo
10917 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10918 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10920 /* Get the correct slp vectorized stmt. */
10921 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10922 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10924 /* In case we need to early break vectorize also get the first stmt. */
10925 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10926 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10928 /* Get entry to use. */
10929 bitstart = bitsize_int (vec_index);
10930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10932 else
10934 /* For multiple copies, get the last copy. */
10935 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10936 vec_lhs = gimple_get_lhs (vec_stmt);
10938 /* In case we need to early break vectorize also get the first stmt. */
10939 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10940 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10942 /* Get the last lane in the vector. */
10943 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
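/* E.g. for a (hypothetical) 4 x 32-bit vector: bitsize = 32 and
   bitstart = 32 * 3 = 96, the bit offset of the last lane.  */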
10946 if (loop_vinfo)
10948 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10949 requirement, insert one phi node for it. It looks like:
10950 loop;
10952 # lhs' = PHI <lhs>
10954 loop;
10956 # vec_lhs' = PHI <vec_lhs>
10957 new_tree = lane_extract <vec_lhs', ...>;
10958 lhs' = new_tree; */
10960 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10961 /* Check if we have a loop where the chosen exit is not the main exit;
10962 in these cases, for an early break, the scalar loop restarts the iteration
10963 the vector code was executing. For the live values we therefore want the
10964 value at the start of that iteration rather than at the end. */
10965 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10966 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10967 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10968 if (!is_gimple_debug (use_stmt)
10969 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10970 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10972 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10973 phi_arg_index_from_use (use_p));
10974 bool main_exit_edge = e == main_e
10975 || find_connected_edge (main_e, e->src);
10977 /* Early exits have a merge block; we want the merge block itself,
10978 so use ->src. For the main exit the merge block is the
10979 destination. */
10980 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10981 tree tmp_vec_lhs = vec_lhs;
10982 tree tmp_bitstart = bitstart;
10984 /* For an early exit where the exit is not in the BB that leads
10985 to the latch, we restart the iteration in the
10986 scalar loop. So get the first live value. */
10987 restart_loop = restart_loop || !main_exit_edge;
10988 if (restart_loop
10989 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10991 tmp_vec_lhs = vec_lhs0;
10992 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10995 gimple_stmt_iterator exit_gsi;
10996 tree new_tree
10997 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10998 dest, vectype, ncopies,
10999 slp_node, bitsize,
11000 tmp_bitstart, tmp_vec_lhs,
11001 lhs_type, &exit_gsi);
11003 if (gimple_phi_num_args (use_stmt) == 1)
11005 auto gsi = gsi_for_stmt (use_stmt);
11006 remove_phi_node (&gsi, false);
11007 tree lhs_phi = gimple_phi_result (use_stmt);
11008 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11009 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11011 else
11012 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
11015 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11016 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11017 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11019 else
11021 /* For basic-block vectorization simply insert the lane-extraction. */
11022 tree bftype = TREE_TYPE (vectype);
11023 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11024 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11025 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11026 vec_lhs, bitsize, bitstart);
11027 gimple_seq stmts = NULL;
11028 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11029 &stmts, true, NULL_TREE);
11030 if (TREE_CODE (new_tree) == SSA_NAME
11031 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11032 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11033 if (is_a <gphi *> (vec_stmt))
11035 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11036 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11038 else
11040 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11041 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11044 /* Replace the use of lhs with the newly computed result. If the use stmt is a
11045 single-arg PHI, just replace all uses of the PHI result. This is necessary
11046 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
11047 use_operand_p use_p;
11048 stmt_vec_info use_stmt_info;
11049 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11050 if (!is_gimple_debug (use_stmt)
11051 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11052 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11054 /* ??? This can happen when the live lane ends up being
11055 rooted in a vector construction code-generated by an
11056 external SLP node (and code-generation for that already
11057 happened). See gcc.dg/vect/bb-slp-47.c.
11058 Doing this is what would happen if that vector CTOR
11059 were not code-generated yet so it is not too bad.
11060 ??? In fact we'd likely want to avoid this situation
11061 in the first place. */
11062 if (TREE_CODE (new_tree) == SSA_NAME
11063 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11064 && gimple_code (use_stmt) != GIMPLE_PHI
11065 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11066 use_stmt))
11068 if (dump_enabled_p ())
11069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11070 "Using original scalar computation for "
11071 "live lane because use preceeds vector "
11072 "def\n");
11073 continue;
11075 /* ??? It can also happen that we end up pulling a def into
11076 a loop where replacing out-of-loop uses would require
11077 a new LC SSA PHI node. Retain the original scalar in
11078 those cases as well. PR98064. */
11079 if (TREE_CODE (new_tree) == SSA_NAME
11080 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11081 && (gimple_bb (use_stmt)->loop_father
11082 != gimple_bb (vec_stmt)->loop_father)
11083 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11084 gimple_bb (use_stmt)->loop_father))
11086 if (dump_enabled_p ())
11087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11088 "Using original scalar computation for "
11089 "live lane because there is an out-of-loop "
11090 "definition for it\n");
11091 continue;
11093 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11094 SET_USE (use_p, new_tree);
11095 update_stmt (use_stmt);
11099 return true;
11102 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11104 static void
11105 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11107 ssa_op_iter op_iter;
11108 imm_use_iterator imm_iter;
11109 def_operand_p def_p;
11110 gimple *ustmt;
11112 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11114 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11116 basic_block bb;
11118 if (!is_gimple_debug (ustmt))
11119 continue;
11121 bb = gimple_bb (ustmt);
11123 if (!flow_bb_inside_loop_p (loop, bb))
11125 if (gimple_debug_bind_p (ustmt))
11127 if (dump_enabled_p ())
11128 dump_printf_loc (MSG_NOTE, vect_location,
11129 "killing debug use\n");
11131 gimple_debug_bind_reset_value (ustmt);
11132 update_stmt (ustmt);
11134 else
11135 gcc_unreachable ();
11141 /* Given loop represented by LOOP_VINFO, return true if computation of
11142 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11143 otherwise. */
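/* As a hypothetical example: with a 32-bit IV and NITERSM1 == 0xffffffff,
   NITERS wraps to 0; neither the constant check nor the maximum-iteration
   bound check below then succeeds, so the function returns false.  */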
11145 static bool
11146 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11148 /* Constant case. */
11149 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11151 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11152 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11154 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11155 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11156 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11157 return true;
11160 widest_int max;
11161 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11162 /* Check the upper bound of loop niters. */
11163 if (get_max_loop_iterations (loop, &max))
11165 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11166 signop sgn = TYPE_SIGN (type);
11167 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11168 if (max < type_max)
11169 return true;
11171 return false;
11174 /* Return a mask type with half the number of elements as OLD_TYPE,
11175 given that it should have mode NEW_MODE. */
11177 tree
11178 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11180 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11181 return build_truth_vector_type_for_mode (nunits, new_mode);
11184 /* Return a mask type with twice as many elements as OLD_TYPE,
11185 given that it should have mode NEW_MODE. */
11187 tree
11188 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11190 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11191 return build_truth_vector_type_for_mode (nunits, new_mode);
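/* For the two helpers above, e.g. (hypothetical modes): halving an
   8-element mask type yields a 4-element truth type in NEW_MODE, and
   doubling a 4-element mask type yields an 8-element one; only the
   element count changes, the mode is taken from NEW_MODE.  */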
11194 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11195 contain a sequence of NVECTORS masks that each control a vector of type
11196 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11197 these vector masks with the vector version of SCALAR_MASK. */
11199 void
11200 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11201 unsigned int nvectors, tree vectype, tree scalar_mask)
11203 gcc_assert (nvectors != 0);
11205 if (scalar_mask)
11207 scalar_cond_masked_key cond (scalar_mask, nvectors);
11208 loop_vinfo->scalar_cond_masked_set.add (cond);
11211 masks->mask_set.add (std::make_pair (vectype, nvectors));
11214 /* Given a complete set of masks MASKS, extract mask number INDEX
11215 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11216 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11218 See the comment above vec_loop_masks for more details about the mask
11219 arrangement. */
11221 tree
11222 vect_get_loop_mask (loop_vec_info loop_vinfo,
11223 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11224 unsigned int nvectors, tree vectype, unsigned int index)
11226 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11227 == vect_partial_vectors_while_ult)
11229 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11230 tree mask_type = rgm->type;
11232 /* Populate the rgroup's mask array, if this is the first time we've
11233 used it. */
11234 if (rgm->controls.is_empty ())
11236 rgm->controls.safe_grow_cleared (nvectors, true);
11237 for (unsigned int i = 0; i < nvectors; ++i)
11239 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11240 /* Provide a dummy definition until the real one is available. */
11241 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11242 rgm->controls[i] = mask;
11246 tree mask = rgm->controls[index];
11247 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11248 TYPE_VECTOR_SUBPARTS (vectype)))
11250 /* A loop mask for data type X can be reused for data type Y
11251 if X has N times more elements than Y and if Y's elements
11252 are N times bigger than X's. In this case each sequence
11253 of N elements in the loop mask will be all-zero or all-one.
11254 We can then view-convert the mask so that each sequence of
11255 N elements is replaced by a single element. */
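/* For instance (made-up types): a mask created for 8 x 16-bit data can
   control 4 x 32-bit data; each pair of mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below folds every pair into
   one wider mask element.  */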
11256 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11257 TYPE_VECTOR_SUBPARTS (vectype)));
11258 gimple_seq seq = NULL;
11259 mask_type = truth_type_for (vectype);
11260 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11261 if (seq)
11262 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11264 return mask;
11266 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11267 == vect_partial_vectors_avx512)
11269 /* The number of scalars per iteration and the number of vectors are
11270 both compile-time constants. */
11271 unsigned int nscalars_per_iter
11272 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11273 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11275 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11277 /* The stored nV is dependent on the mask type produced. */
11278 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11279 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11280 == rgm->factor);
11281 nvectors = rgm->factor;
11283 /* Populate the rgroup's mask array, if this is the first time we've
11284 used it. */
11285 if (rgm->controls.is_empty ())
11287 rgm->controls.safe_grow_cleared (nvectors, true);
11288 for (unsigned int i = 0; i < nvectors; ++i)
11290 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11291 /* Provide a dummy definition until the real one is available. */
11292 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11293 rgm->controls[i] = mask;
11296 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11297 TYPE_VECTOR_SUBPARTS (vectype)))
11298 return rgm->controls[index];
11300 /* Split the vector if needed. Since we are dealing with integer mode
11301 masks with AVX512 we can operate on the integer representation
11302 performing the whole vector shifting. */
11303 unsigned HOST_WIDE_INT factor;
11304 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11305 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11306 gcc_assert (ok);
11307 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11308 tree mask_type = truth_type_for (vectype);
11309 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11310 unsigned vi = index / factor;
11311 unsigned vpart = index % factor;
11312 tree vec = rgm->controls[vi];
11313 gimple_seq seq = NULL;
11314 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11315 lang_hooks.types.type_for_mode
11316 (TYPE_MODE (rgm->type), 1), vec);
11317 /* For integer mode masks simply shift the right bits into position. */
11318 if (vpart != 0)
11319 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11320 build_int_cst (integer_type_node,
11321 (TYPE_VECTOR_SUBPARTS (vectype)
11322 * vpart)));
11323 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11324 (TYPE_MODE (mask_type), 1), vec);
11325 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11326 if (seq)
11327 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11328 return vec;
11330 else
11331 gcc_unreachable ();
11334 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11335 lengths for controlling an operation on VECTYPE. The operation splits
11336 each element of VECTYPE into FACTOR separate subelements, measuring the
11337 length as a number of these subelements. */
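/* E.g. (hypothetical): if 4 x 32-bit accesses are emulated with a byte
   vector, FACTOR is 4 and the recorded length counts bytes rather than
   32-bit elements.  */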
11339 void
11340 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11341 unsigned int nvectors, tree vectype, unsigned int factor)
11343 gcc_assert (nvectors != 0);
11344 if (lens->length () < nvectors)
11345 lens->safe_grow_cleared (nvectors, true);
11346 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11348 /* The number of scalars per iteration, the scalar occupied bytes and
11349 the number of vectors are all compile-time constants. */
11350 unsigned int nscalars_per_iter
11351 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11352 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11354 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11356 /* For now, we only support cases in which all loads and stores fall back
11357 to VnQI or none do. */
11358 gcc_assert (!rgl->max_nscalars_per_iter
11359 || (rgl->factor == 1 && factor == 1)
11360 || (rgl->max_nscalars_per_iter * rgl->factor
11361 == nscalars_per_iter * factor));
11362 rgl->max_nscalars_per_iter = nscalars_per_iter;
11363 rgl->type = vectype;
11364 rgl->factor = factor;
11368 /* Given a complete set of lengths LENS, extract length number INDEX
11369 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11370 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11371 multiplied by the number of elements that should be processed.
11372 Insert any set-up statements before GSI. */
11374 tree
11375 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11376 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11377 unsigned int index, unsigned int factor)
11379 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11380 bool use_bias_adjusted_len =
11381 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11383 /* Populate the rgroup's len array, if this is the first time we've
11384 used it. */
11385 if (rgl->controls.is_empty ())
11387 rgl->controls.safe_grow_cleared (nvectors, true);
11388 for (unsigned int i = 0; i < nvectors; ++i)
11390 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11391 gcc_assert (len_type != NULL_TREE);
11393 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11395 /* Provide a dummy definition until the real one is available. */
11396 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11397 rgl->controls[i] = len;
11399 if (use_bias_adjusted_len)
11401 gcc_assert (i == 0);
11402 tree adjusted_len =
11403 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11404 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11405 rgl->bias_adjusted_ctrl = adjusted_len;
11410 if (use_bias_adjusted_len)
11411 return rgl->bias_adjusted_ctrl;
11413 tree loop_len = rgl->controls[index];
11414 if (rgl->factor == 1 && factor == 1)
11416 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11417 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11418 if (maybe_ne (nunits1, nunits2))
11420 /* A loop len for data type X can be reused for data type Y
11421 if X has N times more elements than Y and if Y's elements
11422 are N times bigger than X's. */
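/* For example (made-up types): a length computed for 16 x 8-bit data and
   reused for 4 x 32-bit data is divided by factor 4 below, since each
   32-bit element spans four of the original lanes.  */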
11423 gcc_assert (multiple_p (nunits1, nunits2));
11424 factor = exact_div (nunits1, nunits2).to_constant ();
11425 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11426 gimple_seq seq = NULL;
11427 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11428 build_int_cst (iv_type, factor));
11429 if (seq)
11430 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11433 return loop_len;
11436 /* Scale profiling counters by estimation for LOOP which is vectorized
11437 by factor VF.
11438 If FLAT is true, the loop we started with had an unrealistically flat
11439 profile. */
11441 static void
11442 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11444 /* For flat profiles do not scale down proportionally by VF and only
11445 cap by known iteration count bounds. */
11446 if (flat)
11448 if (dump_file && (dump_flags & TDF_DETAILS))
11449 fprintf (dump_file,
11450 "Vectorized loop profile seems flat; not scaling iteration "
11451 "count down by the vectorization factor %i\n", vf);
11452 scale_loop_profile (loop, profile_probability::always (),
11453 get_likely_max_loop_iterations_int (loop));
11454 return;
11456 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
11457 profile_count entry_count = loop_preheader_edge (loop)->count ();
11459 /* If we have an unreliable loop profile, avoid dropping the entry
11460 count below the header count. This can happen since loops
11461 have unrealistically low trip counts. */
11462 while (vf > 1
11463 && loop->header->count > entry_count
11464 && loop->header->count < entry_count * vf)
11466 if (dump_file && (dump_flags & TDF_DETAILS))
11467 fprintf (dump_file,
11468 "Vectorization factor %i seems too large for profile "
11469 "prevoiusly believed to be consistent; reducing.\n", vf);
11470 vf /= 2;
11473 if (entry_count.nonzero_p ())
11474 set_edge_probability_and_rescale_others
11475 (exit_e,
11476 entry_count.probability_in (loop->header->count / vf));
11477 /* Avoid producing a very large exit probability when we do not have a
11478 sensible profile. */
11479 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11480 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11481 loop->latch->count = single_pred_edge (loop->latch)->count ();
11483 scale_loop_profile (loop, profile_probability::always () / vf,
11484 get_likely_max_loop_iterations_int (loop));
11487 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11488 latch edge values originally defined by it. */
11490 static void
11491 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11492 stmt_vec_info def_stmt_info)
11494 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11495 if (!def || TREE_CODE (def) != SSA_NAME)
11496 return;
11497 stmt_vec_info phi_info;
11498 imm_use_iterator iter;
11499 use_operand_p use_p;
11500 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11502 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11503 if (!phi)
11504 continue;
11505 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11506 && (phi_info = loop_vinfo->lookup_stmt (phi))
11507 && STMT_VINFO_RELEVANT_P (phi_info)))
11508 continue;
11509 loop_p loop = gimple_bb (phi)->loop_father;
11510 edge e = loop_latch_edge (loop);
11511 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11512 continue;
11514 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11515 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11516 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11518 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11519 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11520 gcc_assert (phi_defs.length () == latch_defs.length ());
11521 for (unsigned i = 0; i < phi_defs.length (); ++i)
11522 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11523 gimple_get_lhs (latch_defs[i]), e,
11524 gimple_phi_arg_location (phi, e->dest_idx));
11526 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11528 /* For first order recurrences we have to update both uses of
11529 the latch definition, the one in the PHI node and the one
11530 in the generated VEC_PERM_EXPR. */
11531 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11532 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11533 gcc_assert (phi_defs.length () == latch_defs.length ());
11534 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11535 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11536 for (unsigned i = 0; i < phi_defs.length (); ++i)
11538 gassign *perm = as_a <gassign *> (phi_defs[i]);
11539 if (i > 0)
11540 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11541 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11542 update_stmt (perm);
11544 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11545 gimple_phi_arg_location (phi, e->dest_idx));
11550 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11551 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11552 stmt_vec_info. */
11554 static bool
11555 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11556 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11558 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11559 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11561 if (dump_enabled_p ())
11562 dump_printf_loc (MSG_NOTE, vect_location,
11563 "------>vectorizing statement: %G", stmt_info->stmt);
11565 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11566 vect_loop_kill_debug_uses (loop, stmt_info);
11568 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11569 && !STMT_VINFO_LIVE_P (stmt_info))
11571 if (is_gimple_call (stmt_info->stmt)
11572 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11574 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11575 *seen_store = stmt_info;
11576 return false;
11578 return false;
11581 if (STMT_VINFO_VECTYPE (stmt_info))
11583 poly_uint64 nunits
11584 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11585 if (!STMT_SLP_TYPE (stmt_info)
11586 && maybe_ne (nunits, vf)
11587 && dump_enabled_p ())
11588 /* For SLP, VF is set according to the unrolling factor and not
11589 to the vector size, hence for SLP this print is not valid. */
11590 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11593 /* Pure SLP statements have already been vectorized. We still need
11594 to apply loop vectorization to hybrid SLP statements. */
11595 if (PURE_SLP_STMT (stmt_info))
11596 return false;
11598 if (dump_enabled_p ())
11599 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11601 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11602 *seen_store = stmt_info;
11604 return true;
11607 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11608 in the hash_map with their corresponding values. */
11610 static tree
11611 find_in_mapping (tree t, void *context)
11613 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11615 tree *value = mapping->get (t);
11616 return value ? *value : t;
11619 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11620 original loop that has now been vectorized.
11622 The inits of the data_references need to be advanced with the number of
11623 iterations of the main loop. This has been computed in vect_do_peeling and
11624 is stored in parameter ADVANCE. We first restore the data_references
11625 initial offset with the values recorded in ORIG_DRS_INIT.
11627 Since the loop_vec_info of this EPILOGUE was constructed for the original
11628 loop, its stmt_vec_infos all point to the original statements. These need
11629 to be updated to point to their corresponding copies as well as the SSA_NAMES
11630 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11632 The data_reference's connections also need to be updated. Their
11633 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11634 stmt_vec_infos, their statements need to point to their corresponding copy,
11635 if they are gather loads or scatter stores then their reference needs to be
11636 updated to point to its corresponding copy and finally we set
11637 'base_misaligned' to false as we have already peeled for alignment in the
11638 prologue of the main loop. */
11640 static void
11641 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11643 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11644 auto_vec<gimple *> stmt_worklist;
11645 hash_map<tree,tree> mapping;
11646 gimple *orig_stmt, *new_stmt;
11647 gimple_stmt_iterator epilogue_gsi;
11648 gphi_iterator epilogue_phi_gsi;
11649 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11650 basic_block *epilogue_bbs = get_loop_body (epilogue);
11651 unsigned i;
11653 free (LOOP_VINFO_BBS (epilogue_vinfo));
11654 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11656 /* Advance data_reference's with the number of iterations of the previous
11657 loop and its prologue. */
11658 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11661 /* The EPILOGUE loop is a copy of the original loop so they share the same
11662 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11663 point to the copied statements. We also create a mapping from each LHS in
11664 the original loop to the corresponding LHS in the EPILOGUE and create worklists
11665 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
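/* Hypothetical illustration (the SSA names below are made up, not from the
   original source): if the main loop contains _5 = a[i_3] and the epilogue
   copy of that statement is _25 = a[i_23], the mapping records _5 -> _25 so
   that the worklist processing further down can rewrite pattern statements
   that still refer to _5.  */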
11666 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11668 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11669 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11671 new_stmt = epilogue_phi_gsi.phi ();
11673 gcc_assert (gimple_uid (new_stmt) > 0);
11674 stmt_vinfo
11675 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11677 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11678 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11680 mapping.put (gimple_phi_result (orig_stmt),
11681 gimple_phi_result (new_stmt));
11682 /* PHI nodes cannot have patterns or related statements. */
11683 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11684 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11687 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11688 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11690 new_stmt = gsi_stmt (epilogue_gsi);
11691 if (is_gimple_debug (new_stmt))
11692 continue;
11694 gcc_assert (gimple_uid (new_stmt) > 0);
11695 stmt_vinfo
11696 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11698 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11699 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11701 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11702 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11704 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11706 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11707 for (gimple_stmt_iterator gsi = gsi_start (seq);
11708 !gsi_end_p (gsi); gsi_next (&gsi))
11709 stmt_worklist.safe_push (gsi_stmt (gsi));
11712 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11713 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11715 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11716 stmt_worklist.safe_push (stmt);
11717 /* Set BB such that the assert in
11718 'get_initial_def_for_reduction' is able to determine that
11719 the BB of the related stmt is inside this loop. */
11720 gimple_set_bb (stmt,
11721 gimple_bb (new_stmt));
11722 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11723 gcc_assert (related_vinfo == NULL
11724 || related_vinfo == stmt_vinfo);
11729 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11730 using the original main loop and thus need to be updated to refer to the
11731 cloned variables used in the epilogue. */
11732 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11734 gimple *stmt = stmt_worklist[i];
11735 tree *new_op;
11737 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11739 tree op = gimple_op (stmt, j);
11740 if ((new_op = mapping.get(op)))
11741 gimple_set_op (stmt, j, *new_op);
11742 else
11744 /* PR92429: The last argument of simplify_replace_tree disables
11745 folding when replacing arguments. This is required as
11746 otherwise we might end up with different statements than the
11747 ones analyzed in vect_analyze_loop, leading to different
11748 vectorization. */
11749 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11750 &find_in_mapping, &mapping, false);
11751 gimple_set_op (stmt, j, op);
11756 struct data_reference *dr;
11757 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11758 FOR_EACH_VEC_ELT (datarefs, i, dr)
11760 orig_stmt = DR_STMT (dr);
11761 gcc_assert (gimple_uid (orig_stmt) > 0);
11762 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11763 /* Data references for gather loads and scatter stores do not use the
11764 updated offset we set using ADVANCE. Instead we have to make sure the
11765 reference in each data reference points to the corresponding copy of
11766 the original in the epilogue. Make sure to update both
11767 gather/scatters recognized by dataref analysis and also other
11768 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11769 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11770 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11771 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11773 DR_REF (dr)
11774 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11775 &find_in_mapping, &mapping);
11776 DR_BASE_ADDRESS (dr)
11777 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11778 &find_in_mapping, &mapping);
11780 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11781 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11782 /* The vector size of the epilogue is smaller than that of the main loop,
11783 so the alignment is either the same or lower. This means the dr will
11784 by definition be aligned. */
11785 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11788 epilogue_vinfo->shared->datarefs_copy.release ();
11789 epilogue_vinfo->shared->save_datarefs ();
11792 /* When vectorizing early break statements, instructions that happen before
11793 the early break in the current BB need to be moved to after the early
11794 break. This function deals with that and assumes that any validity
11795 checks have already been performed.
11797 While moving the statements this function also corrects their virtual
11798 operands: the VUSEs of the affected loads are updated as the stores are
11799 moved to the start of LOOP_VINFO_EARLY_BRK_DEST_BB. */
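/* Hedged sketch of the effect (the example statements are made up, not from
   the original source): given a loop body such as

     a[i_3] = x_4;              <- store that originally precedes the break
     if (b[i_3] != 42) goto exit;
     ...

   the store is moved to the start of LOOP_VINFO_EARLY_BRK_DEST_BB, after the
   break, and each load recorded in LOOP_VINFO_EARLY_BRK_VUSES gets its VUSE
   updated to the VUSE of the last recorded store.  */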
11801 static void
11802 move_early_exit_stmts (loop_vec_info loop_vinfo)
11804 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11806 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11807 return;
11809 /* Move all stmts that need moving. */
11810 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11811 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11813 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11815 /* Check to see if the statement is still required for vectorization or
11816 has been elided. */
11817 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11818 if (!stmt_info)
11819 continue;
11821 if (dump_enabled_p ())
11822 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11824 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11825 gsi_move_before (&stmt_gsi, &dest_gsi);
11826 gsi_prev (&dest_gsi);
11829 /* Update all the stmts with their new reaching VUSES. */
11830 tree vuse
11831 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11832 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11834 if (dump_enabled_p ())
11835 dump_printf_loc (MSG_NOTE, vect_location,
11836 "updating vuse to %T for load %G", vuse, p);
11837 gimple_set_vuse (p, vuse);
11838 update_stmt (p);
11842 /* Function vect_transform_loop.
11844 The analysis phase has determined that the loop is vectorizable.
11845 Vectorize the loop - create vectorized stmts to replace the scalar
11846 stmts in the loop, and update the loop exit condition.
11847 Returns scalar epilogue loop if any. */
11849 class loop *
11850 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11852 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11853 class loop *epilogue = NULL;
11854 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11855 int nbbs = loop->num_nodes;
11856 int i;
11857 tree niters_vector = NULL_TREE;
11858 tree step_vector = NULL_TREE;
11859 tree niters_vector_mult_vf = NULL_TREE;
11860 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11861 unsigned int lowest_vf = constant_lower_bound (vf);
11862 gimple *stmt;
11863 bool check_profitability = false;
11864 unsigned int th;
11865 bool flat = maybe_flat_loop_profile (loop);
11867 DUMP_VECT_SCOPE ("vec_transform_loop");
11869 loop_vinfo->shared->check_datarefs ();
11871 /* Use the more conservative vectorization threshold. If the number
11872 of iterations is constant, assume the cost check has been performed
11873 by our caller. If the threshold makes all loops profitable that
11874 run at least the (estimated) vectorization factor number of times,
11875 checking is pointless, too. */
11876 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11877 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11879 if (dump_enabled_p ())
11880 dump_printf_loc (MSG_NOTE, vect_location,
11881 "Profitability threshold is %d loop iterations.\n",
11882 th);
11883 check_profitability = true;
11886 /* Make sure there exists a single-predecessor exit bb. Do this before
11887 versioning. */
11888 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11889 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11891 split_loop_exit_edge (e, true);
11892 if (dump_enabled_p ())
11893 dump_printf (MSG_NOTE, "split exit edge\n");
11896 /* Version the loop first, if required, so the profitability check
11897 comes first. */
11899 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11901 class loop *sloop
11902 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11903 sloop->force_vectorize = false;
11904 check_profitability = false;
11907 /* Make sure there exists a single-predecessor exit bb also on the
11908 scalar loop copy. Do this after versioning but before peeling
11909 so CFG structure is fine for both scalar and if-converted loop
11910 to make slpeel_duplicate_current_defs_from_edges face matched
11911 loop closed PHI nodes on the exit. */
11912 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11914 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11915 if (! single_pred_p (e->dest))
11917 split_loop_exit_edge (e, true);
11918 if (dump_enabled_p ())
11919 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11923 tree niters = vect_build_loop_niters (loop_vinfo);
11924 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11925 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11926 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11927 tree advance;
11928 drs_init_vec orig_drs_init;
11930 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11931 &step_vector, &niters_vector_mult_vf, th,
11932 check_profitability, niters_no_overflow,
11933 &advance);
11934 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11935 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11937 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11938 block after loop exit. We need to scale all that. */
11939 basic_block preheader
11940 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11941 preheader->count
11942 = preheader->count.apply_probability
11943 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11944 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11945 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11946 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11949 if (niters_vector == NULL_TREE)
11951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11952 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11953 && known_eq (lowest_vf, vf))
11955 niters_vector
11956 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11957 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11958 step_vector = build_one_cst (TREE_TYPE (niters));
11960 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11961 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11962 &step_vector, niters_no_overflow);
11963 else
11964 /* vect_do_peeling subtracted the number of peeled prologue
11965 iterations from LOOP_VINFO_NITERS. */
11966 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11967 &niters_vector, &step_vector,
11968 niters_no_overflow);
11971 /* 1) Make sure the loop header has exactly two entries
11972 2) Make sure we have a preheader basic block. */
11974 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11976 split_edge (loop_preheader_edge (loop));
11978 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11979 /* This will deal with any possible peeling. */
11980 vect_prepare_for_masked_peels (loop_vinfo);
11982 /* Handle any code motion that we need to for early-break vectorization after
11983 we've done peeling but just before we start vectorizing. */
11984 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11985 move_early_exit_stmts (loop_vinfo);
11987 /* Schedule the SLP instances first, then handle loop vectorization
11988 below. */
11989 if (!loop_vinfo->slp_instances.is_empty ())
11991 DUMP_VECT_SCOPE ("scheduling SLP instances");
11992 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11995 /* FORNOW: the vectorizer supports only loops whose body consists
11996 of one basic block (header + empty latch). When the vectorizer
11997 supports more involved loop forms, the order in which the BBs are
11998 traversed needs to be reconsidered. */
12000 for (i = 0; i < nbbs; i++)
12002 basic_block bb = bbs[i];
12003 stmt_vec_info stmt_info;
12005 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12006 gsi_next (&si))
12008 gphi *phi = si.phi ();
12009 if (dump_enabled_p ())
12010 dump_printf_loc (MSG_NOTE, vect_location,
12011 "------>vectorizing phi: %G", (gimple *) phi);
12012 stmt_info = loop_vinfo->lookup_stmt (phi);
12013 if (!stmt_info)
12014 continue;
12016 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12017 vect_loop_kill_debug_uses (loop, stmt_info);
12019 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12020 && !STMT_VINFO_LIVE_P (stmt_info))
12021 continue;
12023 if (STMT_VINFO_VECTYPE (stmt_info)
12024 && (maybe_ne
12025 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12026 && dump_enabled_p ())
12027 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12029 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12030 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12031 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12032 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12033 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12034 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12035 && ! PURE_SLP_STMT (stmt_info))
12037 if (dump_enabled_p ())
12038 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12039 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12043 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12044 gsi_next (&si))
12046 gphi *phi = si.phi ();
12047 stmt_info = loop_vinfo->lookup_stmt (phi);
12048 if (!stmt_info)
12049 continue;
12051 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12052 && !STMT_VINFO_LIVE_P (stmt_info))
12053 continue;
12055 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12056 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12057 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12058 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12059 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12060 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12061 && ! PURE_SLP_STMT (stmt_info))
12062 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12065 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12066 !gsi_end_p (si);)
12068 stmt = gsi_stmt (si);
12069 /* During vectorization remove existing clobber stmts. */
12070 if (gimple_clobber_p (stmt))
12072 unlink_stmt_vdef (stmt);
12073 gsi_remove (&si, true);
12074 release_defs (stmt);
12076 else
12078 /* Ignore vector stmts created in the outer loop. */
12079 stmt_info = loop_vinfo->lookup_stmt (stmt);
12081 /* vector stmts created in the outer-loop during vectorization of
12082 stmts in an inner-loop may not have a stmt_info, and do not
12083 need to be vectorized. */
12084 stmt_vec_info seen_store = NULL;
12085 if (stmt_info)
12087 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12089 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12090 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12091 !gsi_end_p (subsi); gsi_next (&subsi))
12093 stmt_vec_info pat_stmt_info
12094 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12095 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12096 &si, &seen_store);
12098 stmt_vec_info pat_stmt_info
12099 = STMT_VINFO_RELATED_STMT (stmt_info);
12100 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12101 &si, &seen_store))
12102 maybe_set_vectorized_backedge_value (loop_vinfo,
12103 pat_stmt_info);
12105 else
12107 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12108 &seen_store))
12109 maybe_set_vectorized_backedge_value (loop_vinfo,
12110 stmt_info);
12113 gsi_next (&si);
12114 if (seen_store)
12116 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12117 /* Interleaving. The vectorization of the
12118 interleaving chain was completed - free all
12119 the stores in the chain. */
12120 vect_remove_stores (loop_vinfo,
12121 DR_GROUP_FIRST_ELEMENT (seen_store));
12122 else
12123 /* Free the attached stmt_vec_info and remove the stmt. */
12124 loop_vinfo->remove_stmt (stmt_info);
12129 /* Stub out scalar statements that must not survive vectorization.
12130 Doing this here helps with grouped statements, or statements that
12131 are involved in patterns. */
12132 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12133 !gsi_end_p (gsi); gsi_next (&gsi))
12135 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12136 if (!call || !gimple_call_internal_p (call))
12137 continue;
12138 internal_fn ifn = gimple_call_internal_fn (call);
12139 if (ifn == IFN_MASK_LOAD)
12141 tree lhs = gimple_get_lhs (call);
12142 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12144 tree zero = build_zero_cst (TREE_TYPE (lhs));
12145 gimple *new_stmt = gimple_build_assign (lhs, zero);
12146 gsi_replace (&gsi, new_stmt, true);
12149 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12151 tree lhs = gimple_get_lhs (call);
12152 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12154 tree else_arg
12155 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12156 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12157 gsi_replace (&gsi, new_stmt, true);
12161 } /* BBs in loop */
12163 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12164 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12165 if (integer_onep (step_vector))
12166 niters_no_overflow = true;
12167 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12168 niters_vector, step_vector, niters_vector_mult_vf,
12169 !niters_no_overflow);
12171 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12173 /* True if the final iteration might not handle a full vector's
12174 worth of scalar iterations. */
12175 bool final_iter_may_be_partial
12176 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12177 /* The minimum number of iterations performed by the epilogue. This
12178 is 1 when peeling for gaps because we always need a final scalar
12179 iteration. */
12180 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12181 /* +1 to convert latch counts to loop iteration counts,
12182 -min_epilogue_iters to remove iterations that cannot be performed
12183 by the vector code. */
12184 int bias_for_lowest = 1 - min_epilogue_iters;
12185 int bias_for_assumed = bias_for_lowest;
12186 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12187 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12189 /* When the amount of peeling is known at compile time, the first
12190 iteration will have exactly alignment_npeels active elements.
12191 In the worst case it will have at least one. */
12192 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12193 bias_for_lowest += lowest_vf - min_first_active;
12194 bias_for_assumed += assumed_vf - min_first_active;
12196 /* In these calculations the "- 1" converts loop iteration counts
12197 back to latch counts. */
12198 if (loop->any_upper_bound)
12200 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12201 loop->nb_iterations_upper_bound
12202 = (final_iter_may_be_partial
12203 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12204 lowest_vf) - 1
12205 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12206 lowest_vf) - 1);
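/* Hypothetical numeric illustration (the values are made up): with
   nb_iterations_upper_bound == 7 (at most 8 scalar iterations),
   lowest_vf == 4, no peeling for gaps (so bias_for_lowest == 1) and a
   non-partial final iteration, the new bound is
   udiv_floor (7 + 1, 4) - 1 == 1, i.e. the vector loop's latch runs at
   most once.  */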
12207 if (main_vinfo
12208 /* Both peeling for alignment and peeling for gaps can end up
12209 with the scalar epilogue running for more than VF-1 iterations. */
12210 && !main_vinfo->peeling_for_alignment
12211 && !main_vinfo->peeling_for_gaps)
12213 unsigned int bound;
12214 poly_uint64 main_iters
12215 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12216 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12217 main_iters
12218 = upper_bound (main_iters,
12219 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12220 if (can_div_away_from_zero_p (main_iters,
12221 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12222 &bound))
12223 loop->nb_iterations_upper_bound
12224 = wi::umin ((bound_wide_int) (bound - 1),
12225 loop->nb_iterations_upper_bound);
12228 if (loop->any_likely_upper_bound)
12229 loop->nb_iterations_likely_upper_bound
12230 = (final_iter_may_be_partial
12231 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12232 + bias_for_lowest, lowest_vf) - 1
12233 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12234 + bias_for_lowest, lowest_vf) - 1);
12235 if (loop->any_estimate)
12236 loop->nb_iterations_estimate
12237 = (final_iter_may_be_partial
12238 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12239 assumed_vf) - 1
12240 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12241 assumed_vf) - 1);
12242 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12243 assumed_vf, flat);
12245 if (dump_enabled_p ())
12247 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12249 dump_printf_loc (MSG_NOTE, vect_location,
12250 "LOOP VECTORIZED\n");
12251 if (loop->inner)
12252 dump_printf_loc (MSG_NOTE, vect_location,
12253 "OUTER LOOP VECTORIZED\n");
12254 dump_printf (MSG_NOTE, "\n");
12256 else
12257 dump_printf_loc (MSG_NOTE, vect_location,
12258 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12259 GET_MODE_NAME (loop_vinfo->vector_mode));
12262 /* Loops vectorized with a variable factor won't benefit from
12263 unrolling/peeling. */
12264 if (!vf.is_constant ())
12266 loop->unroll = 1;
12267 if (dump_enabled_p ())
12268 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12269 " variable-length vectorization factor\n");
12271 /* Free SLP instances here because otherwise stmt reference counting
12272 won't work. */
12273 slp_instance instance;
12274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12275 vect_free_slp_instance (instance);
12276 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12277 /* Clear the safelen field since its value is invalid after vectorization:
12278 the vectorized loop can have loop-carried dependencies. */
12279 loop->safelen = 0;
12281 if (epilogue)
12283 update_epilogue_loop_vinfo (epilogue, advance);
12285 epilogue->simduid = loop->simduid;
12286 epilogue->force_vectorize = loop->force_vectorize;
12287 epilogue->dont_vectorize = false;
12290 return epilogue;
12293 /* The code below tries to perform a simple optimization - revert
12294 if-conversion for masked stores, i.e. if the mask of a store is zero,
12295 do not perform the store, and if possible also skip the producers of the stored values.
12296 For example,
12297 for (i=0; i<n; i++)
12298 if (c[i])
12300 p1[i] += 1;
12301 p2[i] = p3[i] +2;
12303 this transformation will produce the following semi-hammock:
12305 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12307 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12308 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12309 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12310 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12311 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12312 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12316 void
12317 optimize_mask_stores (class loop *loop)
12319 basic_block *bbs = get_loop_body (loop);
12320 unsigned nbbs = loop->num_nodes;
12321 unsigned i;
12322 basic_block bb;
12323 class loop *bb_loop;
12324 gimple_stmt_iterator gsi;
12325 gimple *stmt;
12326 auto_vec<gimple *> worklist;
12327 auto_purge_vect_location sentinel;
12329 vect_location = find_loop_location (loop);
12330 /* Pick up all masked stores in loop if any. */
12331 for (i = 0; i < nbbs; i++)
12333 bb = bbs[i];
12334 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12335 gsi_next (&gsi))
12337 stmt = gsi_stmt (gsi);
12338 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12339 worklist.safe_push (stmt);
12343 free (bbs);
12344 if (worklist.is_empty ())
12345 return;
12347 /* Loop has masked stores. */
12348 while (!worklist.is_empty ())
12350 gimple *last, *last_store;
12351 edge e, efalse;
12352 tree mask;
12353 basic_block store_bb, join_bb;
12354 gimple_stmt_iterator gsi_to;
12355 tree vdef, new_vdef;
12356 gphi *phi;
12357 tree vectype;
12358 tree zero;
12360 last = worklist.pop ();
12361 mask = gimple_call_arg (last, 2);
12362 bb = gimple_bb (last);
12363 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12364 the same loop as if_bb. It could be different from LOOP when a
12365 two-level loop nest is vectorized and the mask_store belongs to the
12366 inner one. */
12367 e = split_block (bb, last);
12368 bb_loop = bb->loop_father;
12369 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12370 join_bb = e->dest;
12371 store_bb = create_empty_bb (bb);
12372 add_bb_to_loop (store_bb, bb_loop);
12373 e->flags = EDGE_TRUE_VALUE;
12374 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12375 /* Put STORE_BB to likely part. */
12376 efalse->probability = profile_probability::likely ();
12377 e->probability = efalse->probability.invert ();
12378 store_bb->count = efalse->count ();
12379 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12380 if (dom_info_available_p (CDI_DOMINATORS))
12381 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12382 if (dump_enabled_p ())
12383 dump_printf_loc (MSG_NOTE, vect_location,
12384 "Create new block %d to sink mask stores.",
12385 store_bb->index);
12386 /* Create vector comparison with boolean result. */
12387 vectype = TREE_TYPE (mask);
12388 zero = build_zero_cst (vectype);
12389 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12390 gsi = gsi_last_bb (bb);
12391 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12392 /* Create new PHI node for vdef of the last masked store:
12393 .MEM_2 = VDEF <.MEM_1>
12394 will be converted to
12395 .MEM.3 = VDEF <.MEM_1>
12396 and new PHI node will be created in join bb
12397 .MEM_2 = PHI <.MEM_1, .MEM_3>
12399 vdef = gimple_vdef (last);
12400 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12401 gimple_set_vdef (last, new_vdef);
12402 phi = create_phi_node (vdef, join_bb);
12403 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12405 /* Put all masked stores with the same mask to STORE_BB if possible. */
12406 while (true)
12408 gimple_stmt_iterator gsi_from;
12409 gimple *stmt1 = NULL;
12411 /* Move masked store to STORE_BB. */
12412 last_store = last;
12413 gsi = gsi_for_stmt (last);
12414 gsi_from = gsi;
12415 /* Shift GSI to the previous stmt for further traversal. */
12416 gsi_prev (&gsi);
12417 gsi_to = gsi_start_bb (store_bb);
12418 gsi_move_before (&gsi_from, &gsi_to);
12419 /* Setup GSI_TO to the non-empty block start. */
12420 gsi_to = gsi_start_bb (store_bb);
12421 if (dump_enabled_p ())
12422 dump_printf_loc (MSG_NOTE, vect_location,
12423 "Move stmt to created bb\n%G", last);
12424 /* Move all stored value producers if possible. */
12425 while (!gsi_end_p (gsi))
12427 tree lhs;
12428 imm_use_iterator imm_iter;
12429 use_operand_p use_p;
12430 bool res;
12432 /* Skip debug statements. */
12433 if (is_gimple_debug (gsi_stmt (gsi)))
12435 gsi_prev (&gsi);
12436 continue;
12438 stmt1 = gsi_stmt (gsi);
12439 /* Do not consider statements writing to memory or having a
12440 volatile operand. */
12441 if (gimple_vdef (stmt1)
12442 || gimple_has_volatile_ops (stmt1))
12443 break;
12444 gsi_from = gsi;
12445 gsi_prev (&gsi);
12446 lhs = gimple_get_lhs (stmt1);
12447 if (!lhs)
12448 break;
12450 /* LHS of vectorized stmt must be SSA_NAME. */
12451 if (TREE_CODE (lhs) != SSA_NAME)
12452 break;
12454 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12456 /* Remove dead scalar statement. */
12457 if (has_zero_uses (lhs))
12459 gsi_remove (&gsi_from, true);
12460 continue;
12464 /* Check that LHS does not have uses outside of STORE_BB. */
12465 res = true;
12466 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12468 gimple *use_stmt;
12469 use_stmt = USE_STMT (use_p);
12470 if (is_gimple_debug (use_stmt))
12471 continue;
12472 if (gimple_bb (use_stmt) != store_bb)
12474 res = false;
12475 break;
12478 if (!res)
12479 break;
12481 if (gimple_vuse (stmt1)
12482 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12483 break;
12485 /* Can move STMT1 to STORE_BB. */
12486 if (dump_enabled_p ())
12487 dump_printf_loc (MSG_NOTE, vect_location,
12488 "Move stmt to created bb\n%G", stmt1);
12489 gsi_move_before (&gsi_from, &gsi_to);
12490 /* Shift GSI_TO for further insertion. */
12491 gsi_prev (&gsi_to);
12493 /* Put other masked stores with the same mask to STORE_BB. */
12494 if (worklist.is_empty ()
12495 || gimple_call_arg (worklist.last (), 2) != mask
12496 || worklist.last () != stmt1)
12497 break;
12498 last = worklist.pop ();
12500 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12504 /* Decide whether it is possible to use a zero-based induction variable
12505 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12506 the value that the induction variable must be able to hold in order
12507 to ensure that the rgroups eventually have no active vector elements.
12508 Return -1 otherwise. */
12510 widest_int
12511 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12513 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12514 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12515 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12517 /* Calculate the value that the induction variable must be able
12518 to hit in order to ensure that we end the loop with an all-false mask.
12519 This involves adding the maximum number of inactive trailing scalar
12520 iterations. */
12521 widest_int iv_limit = -1;
12522 if (max_loop_iterations (loop, &iv_limit))
12524 if (niters_skip)
12526 /* Add the maximum number of skipped iterations to the
12527 maximum iteration count. */
12528 if (TREE_CODE (niters_skip) == INTEGER_CST)
12529 iv_limit += wi::to_widest (niters_skip);
12530 else
12531 iv_limit += max_vf - 1;
12533 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12534 /* Make a conservatively-correct assumption. */
12535 iv_limit += max_vf - 1;
12537 /* IV_LIMIT is the maximum number of latch iterations, which is also
12538 the maximum in-range IV value. Round this value down to the previous
12539 vector alignment boundary and then add an extra full iteration. */
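/* Hypothetical numeric illustration (the values are made up): with VF == 4
   (so known_alignment (vf) == 4), max_vf == 4 and iv_limit == 10, the
   computation below gives (10 & -4) + 4 == 12.  */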
12540 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12541 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12543 return iv_limit;
12546 /* For the given rgroup_controls RGC, check whether an induction variable
12547 would ever hit a value that produces a set of all-false masks or zero
12548 lengths before wrapping around. Return true if it's possible to wrap
12549 around before hitting the desirable value, otherwise return false. */
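/* Hedged illustration (the numbers are made up): if the compare type has
   16 bits of precision, iv_limit == 40000 and nitems == 2, then
   iv_limit * nitems == 80000 needs 17 bits, which exceeds 16, so the IV
   might wrap and the function returns true.  */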
12551 bool
12552 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12554 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12556 if (iv_limit == -1)
12557 return true;
12559 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12560 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12561 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12563 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12564 return true;
12566 return false;